Hello, reader! Thank you for your interest in this work.
Let me guide you through the steps taken to conduct the statistical analyses reported in our paper.
Loading libraries
Here are the libraries that we are going to employ. If you have not
already, remember to install them via install.packages()
first.
library(tidyverse)
library(cowplot)
library(randomForest)
library(caret)
library(networkD3)
library(gridExtra)
library(RColorBrewer)
library(extrafont)
library(corrplot)
library(sjmisc)
library(sjlabelled)
library(ggridges)
library(reshape2)
Loading data sets
Next, we are going to load the three data sets obtained after computing the measures needed for our model with Python: one .CSV file for the graded readers corpus, another one for the literary works corpus, and the last one for the reference corpus of general Spanish. Furthermore, we will also load some stats computed by graded lexical item.
# === GRADED READERS ===
# Corpus: one row per graded reader, with all measures computed in Python
readers_corpus <- read_csv('Data/readers.csv')
# Vocabulary stats: per-lexical-item statistics, one file per difficulty band
readers_vocab_beginner <- read_csv('Data/Vocabulary/beginner.csv')
readers_vocab_intermediate <- read_csv('Data/Vocabulary/intermediate.csv')
readers_vocab_advanced <- read_csv('Data/Vocabulary/advanced.csv')
# === LITERARY WORKS ===
# Corpus: one row per literary work
literature_corpus <- read_csv('Data/literature.csv')
# Vocabulary stats, split by intended audience
literature_vocab_children <- read_csv('Data/Vocabulary/children.csv')
literature_vocab_youth <- read_csv('Data/Vocabulary/youth.csv')
literature_vocab_adult <- read_csv('Data/Vocabulary/adult.csv')
# === REFERENCE CORPUS ===
# Corpus of general Spanish: one row per file (609 files, per the notes below)
reference_corpus <- read_csv('Data/reference.csv')
# Vocabulary stats for the reference corpus as a whole
reference_vocab <- read_csv('Data/Vocabulary/reference.csv')
Getting the data sets nice and ready
The original data sets have way too many columns. Now we are going to select only the ones relevant to the statistical analyses. We are also going to add up the column values of auxiliary and regular verb features.
# ================================ GRADED READERS DATA SET ================================
# STEP 1. Select relevant columns
selected_columns <- c("Freq A1", "Freq A2", "Freq B1", "Freq B2",
                      "TFIDF A1", "TFIDF A2", "TFIDF B1", "TFIDF B2",
                      "Tree", "Context freq", "Context TFIDF", "Context tree",
                      "deprel-mark DOC%", "deprel-advcl DOC%", "deprel-xcomp DOC%",
                      "deprel-ccomp DOC%", "deprel-acl DOC%", "deprel-nmod DOC%",
                      "deprel-conj DOC%", "deprel-appos DOC%",
                      "upos-AUX-Tense-Pres DOC%", "upos-AUX-Tense-Past DOC%",
                      "upos-AUX-Tense-Fut DOC%", "upos-AUX-Tense-Imp DOC%",
                      "upos-AUX-Mood-Ind DOC%", "upos-AUX-Mood-Sub DOC%",
                      "upos-AUX-Mood-Cnd DOC%", "upos-VERB-Tense-Pres DOC%",
                      "upos-VERB-Tense-Past DOC%", "upos-VERB-Tense-Fut DOC%",
                      "upos-VERB-Tense-Imp DOC%", "upos-VERB-Mood-Ind DOC%",
                      "upos-VERB-Mood-Sub DOC%", "upos-VERB-Mood-Cnd DOC%",
                      "Lexical density", "msttr", "mtld", "hdd", "Herdan")
readers_corpus <- readers_corpus %>% dplyr::select("Title", "Level",
                                                   all_of(selected_columns))
# STEP 2.1 Merge the A1 and A2 vocabulary measures into a single column each.
# mutate(.before =) puts the combined column in the position of the original
# A1 column, so the resulting column order matches the previous
# match()-based reordering, without the index arithmetic.
readers_corpus <- readers_corpus %>%
  mutate(`Freq A1-A2` = `Freq A1` + `Freq A2`, .before = `Freq A1`) %>%
  dplyr::select(-`Freq A1`, -`Freq A2`)
readers_corpus <- readers_corpus %>%
  mutate(`TFIDF A1-A2` = `TFIDF A1` + `TFIDF A2`, .before = `TFIDF A1`) %>%
  dplyr::select(-`TFIDF A1`, -`TFIDF A2`)
# STEP 2.2 Fold each auxiliary verb feature into its matching regular verb
# column (one loop instead of seven copy-pasted assignments)
aux_verb_features <- c("Tense-Pres", "Tense-Past", "Tense-Fut", "Tense-Imp",
                       "Mood-Ind", "Mood-Sub", "Mood-Cnd")
for (feature in aux_verb_features) {
  verb_col <- paste0("upos-VERB-", feature, " DOC%")
  aux_col <- paste0("upos-AUX-", feature, " DOC%")
  readers_corpus[[verb_col]] <- readers_corpus[[verb_col]] + readers_corpus[[aux_col]]
}
# STEP 3. Drop the now-redundant auxiliary verb feature columns by name,
# which is robust to column reordering (previously dropped positions 21:27)
readers_corpus <- readers_corpus %>% dplyr::select(-starts_with("upos-AUX"))
# ================================== LITERATURE DATA SET ==================================
# STEP 1. Select relevant columns (same selection as the graded readers)
literature_corpus <- literature_corpus %>% dplyr::select("Title", "Level",
                                                         all_of(selected_columns))
# STEP 2.1 Merge the A1 and A2 vocabulary measures into a single column each;
# the combined column takes the position of the original A1 column
literature_corpus <- literature_corpus %>%
  mutate(`Freq A1-A2` = `Freq A1` + `Freq A2`, .before = `Freq A1`) %>%
  dplyr::select(-`Freq A1`, -`Freq A2`)
literature_corpus <- literature_corpus %>%
  mutate(`TFIDF A1-A2` = `TFIDF A1` + `TFIDF A2`, .before = `TFIDF A1`) %>%
  dplyr::select(-`TFIDF A1`, -`TFIDF A2`)
# STEP 2.2 Fold each auxiliary verb feature into its matching regular verb column
aux_verb_features <- c("Tense-Pres", "Tense-Past", "Tense-Fut", "Tense-Imp",
                       "Mood-Ind", "Mood-Sub", "Mood-Cnd")
for (feature in aux_verb_features) {
  verb_col <- paste0("upos-VERB-", feature, " DOC%")
  aux_col <- paste0("upos-AUX-", feature, " DOC%")
  literature_corpus[[verb_col]] <- literature_corpus[[verb_col]] + literature_corpus[[aux_col]]
}
# STEP 3. Drop the now-redundant auxiliary verb feature columns by name,
# which is robust to column reordering (previously dropped positions 21:27)
literature_corpus <- literature_corpus %>% dplyr::select(-starts_with("upos-AUX"))
# =============================== REFERENCE CORPUS DATA SET ===============================
# STEP 1. Select relevant columns (this corpus has no Title/Level columns)
reference_corpus <- reference_corpus %>% dplyr::select(all_of(selected_columns))
# STEP 2.1 Merge the A1 and A2 vocabulary measures into a single column each.
# mutate(.before =) also works when the A1 column is the very first column,
# so the reference corpus no longer needs its own hand-patched select() call.
reference_corpus <- reference_corpus %>%
  mutate(`Freq A1-A2` = `Freq A1` + `Freq A2`, .before = `Freq A1`) %>%
  dplyr::select(-`Freq A1`, -`Freq A2`)
reference_corpus <- reference_corpus %>%
  mutate(`TFIDF A1-A2` = `TFIDF A1` + `TFIDF A2`, .before = `TFIDF A1`) %>%
  dplyr::select(-`TFIDF A1`, -`TFIDF A2`)
# STEP 2.2 Fold each auxiliary verb feature into its matching regular verb column
aux_verb_features <- c("Tense-Pres", "Tense-Past", "Tense-Fut", "Tense-Imp",
                       "Mood-Ind", "Mood-Sub", "Mood-Cnd")
for (feature in aux_verb_features) {
  verb_col <- paste0("upos-VERB-", feature, " DOC%")
  aux_col <- paste0("upos-AUX-", feature, " DOC%")
  reference_corpus[[verb_col]] <- reference_corpus[[verb_col]] + reference_corpus[[aux_col]]
}
# STEP 3. Drop the now-redundant auxiliary verb feature columns by name,
# which is robust to column reordering (previously dropped positions 19:25)
reference_corpus <- reference_corpus %>% dplyr::select(-starts_with("upos-AUX"))
That’s starting to look a lot better now! However, since in the original file all parse tree properties are in a tuple in the same column, we still need to tweak the data frame columns a little bit more:
# STEP 1. REMOVE TUPLE PARENTHESES
# --------------------------------
# Drop every "(" and ")" from a character vector (gsub is vectorized)
strip_parens <- function(x) {
  gsub("\\(|\\)", "", x)
}
# Remove ( and ) from each serialized tuple string.
# strip_parens() wraps gsub(), which is already vectorized over a whole
# column, so it can be applied to the vectors directly — the original
# apply(..., 2, ...) call needlessly coerced each one-column data frame
# to a matrix first.
# === Graded readers ===
readers_corpus$Tree <- strip_parens(readers_corpus$Tree)
readers_corpus$`Context tree` <- strip_parens(readers_corpus$`Context tree`)
# === Literary works ===
literature_corpus$Tree <- strip_parens(literature_corpus$Tree)
literature_corpus$`Context tree` <- strip_parens(literature_corpus$`Context tree`)
# === Reference corpus ===
reference_corpus$Tree <- strip_parens(reference_corpus$Tree)
reference_corpus$`Context tree` <- strip_parens(reference_corpus$`Context tree`)
# STEP 2. CREATE NEW COLUMNS AND DROP OLD ONES
# --------------------------------------------
# Target names for the six per-document tree statistics. The order must match
# the order of the values inside the serialized tuples — assumed to be
# (min width, max width, avg width, min depth, max depth, avg depth);
# TODO confirm against the Python export that produced the CSVs.
new_tree_columns <- c('Min_Width', 'Max_Width', 'Avg_Width',
'Min_Depth', 'Max_Depth', 'Avg_Depth')
new_context_tree_columns <- c('Ctxt_Min_Width', 'Ctxt_Max_Width', 'Ctxt_Avg_Width',
'Ctxt_Min_Depth', 'Ctxt_Max_Depth', 'Ctxt_Avg_Depth')
# str_split_fixed(x, ', ', 6) returns a character matrix with exactly 6
# columns (pieces beyond the available separators become ""), assigned here
# as 6 new columns. Values stay character until STEP 3 converts them.
# === Graded readers ===
readers_corpus[new_tree_columns] <- str_split_fixed(readers_corpus$Tree, ', ', 6)
readers_corpus[new_context_tree_columns] <- str_split_fixed(readers_corpus$`Context tree`, ', ', 6)
# The packed tuple columns are no longer needed once split
readers_corpus <- dplyr::select(readers_corpus, -c('Tree', 'Context tree'))
# === Literary works ===
literature_corpus[new_tree_columns] <- str_split_fixed(literature_corpus$Tree, ', ', 6)
literature_corpus[new_context_tree_columns] <- str_split_fixed(literature_corpus$`Context tree`, ', ', 6)
literature_corpus <- dplyr::select(literature_corpus, -c('Tree', 'Context tree'))
# === Reference corpus ===
reference_corpus[new_tree_columns] <- str_split_fixed(reference_corpus$Tree, ', ', 6)
reference_corpus[new_context_tree_columns] <- str_split_fixed(reference_corpus$`Context tree`, ', ', 6)
reference_corpus <- dplyr::select(reference_corpus, -c('Tree', 'Context tree'))
# STEP 3. CONVERT NEW COLUMNS TO NUMERIC TYPE VECTORS
# ---------------------------------------------------
# mutate(across(...)) converts the columns in place without coercing the
# tibble to a matrix, which is what the previous apply(..., 2, ...) calls
# did under the hood; as.numeric is passed directly instead of being
# wrapped in a redundant anonymous function.
# === Graded readers ===
readers_corpus <- readers_corpus %>%
  mutate(across(all_of(c(new_tree_columns, new_context_tree_columns)), as.numeric))
# === Literary works ===
literature_corpus <- literature_corpus %>%
  mutate(across(all_of(c(new_tree_columns, new_context_tree_columns)), as.numeric))
# === Reference corpus ===
reference_corpus <- reference_corpus %>%
  mutate(across(all_of(c(new_tree_columns, new_context_tree_columns)), as.numeric))
Last but not least, we are going to rename some columns so that they are easier to operate with.
# === Graded readers ===
# Lookup vector mapping new (syntactic) names to the original column names;
# rename(all_of(...)) applies the whole map in one call
readers_rename_map <- c(
  Freq_A1A2 = "Freq A1-A2",
  Freq_B1 = "Freq B1",
  Freq_B2 = "Freq B2",
  TFIDF_A1A2 = "TFIDF A1-A2",
  TFIDF_B1 = "TFIDF B1",
  TFIDF_B2 = "TFIDF B2",
  Ctxt_Freq = "Context freq",
  Ctxt_TFIDF = "Context TFIDF",
  Freq_Markers = "deprel-mark DOC%",
  Freq_AdvCl = "deprel-advcl DOC%",
  Freq_XComp = "deprel-xcomp DOC%",
  Freq_CComp = "deprel-ccomp DOC%",
  Freq_ACl = "deprel-acl DOC%",
  Freq_NMod = "deprel-nmod DOC%",
  Freq_Conj = "deprel-conj DOC%",
  Freq_Appos = "deprel-appos DOC%",
  Freq_Past = "upos-VERB-Tense-Past DOC%",
  Freq_Pres = "upos-VERB-Tense-Pres DOC%",
  Freq_Fut = "upos-VERB-Tense-Fut DOC%",
  Freq_Imp = "upos-VERB-Tense-Imp DOC%",
  Freq_Ind = "upos-VERB-Mood-Ind DOC%",
  Freq_Sub = "upos-VERB-Mood-Sub DOC%",
  Freq_Cnd = "upos-VERB-Mood-Cnd DOC%",
  Density = "Lexical density"
)
readers_corpus <- readers_corpus %>% rename(all_of(readers_rename_map))
# === Literary works ===
# Same mapping as the graded readers: new (syntactic) name = original name
literature_rename_map <- c(
  Freq_A1A2 = "Freq A1-A2",
  Freq_B1 = "Freq B1",
  Freq_B2 = "Freq B2",
  TFIDF_A1A2 = "TFIDF A1-A2",
  TFIDF_B1 = "TFIDF B1",
  TFIDF_B2 = "TFIDF B2",
  Ctxt_Freq = "Context freq",
  Ctxt_TFIDF = "Context TFIDF",
  Freq_Markers = "deprel-mark DOC%",
  Freq_AdvCl = "deprel-advcl DOC%",
  Freq_XComp = "deprel-xcomp DOC%",
  Freq_CComp = "deprel-ccomp DOC%",
  Freq_ACl = "deprel-acl DOC%",
  Freq_NMod = "deprel-nmod DOC%",
  Freq_Conj = "deprel-conj DOC%",
  Freq_Appos = "deprel-appos DOC%",
  Freq_Past = "upos-VERB-Tense-Past DOC%",
  Freq_Pres = "upos-VERB-Tense-Pres DOC%",
  Freq_Fut = "upos-VERB-Tense-Fut DOC%",
  Freq_Imp = "upos-VERB-Tense-Imp DOC%",
  Freq_Ind = "upos-VERB-Mood-Ind DOC%",
  Freq_Sub = "upos-VERB-Mood-Sub DOC%",
  Freq_Cnd = "upos-VERB-Mood-Cnd DOC%",
  Density = "Lexical density"
)
literature_corpus <- literature_corpus %>% rename(all_of(literature_rename_map))
# === Reference corpus ===
# Same mapping as the other two corpora: new (syntactic) name = original name
reference_rename_map <- c(
  Freq_A1A2 = "Freq A1-A2",
  Freq_B1 = "Freq B1",
  Freq_B2 = "Freq B2",
  TFIDF_A1A2 = "TFIDF A1-A2",
  TFIDF_B1 = "TFIDF B1",
  TFIDF_B2 = "TFIDF B2",
  Ctxt_Freq = "Context freq",
  Ctxt_TFIDF = "Context TFIDF",
  Freq_Markers = "deprel-mark DOC%",
  Freq_AdvCl = "deprel-advcl DOC%",
  Freq_XComp = "deprel-xcomp DOC%",
  Freq_CComp = "deprel-ccomp DOC%",
  Freq_ACl = "deprel-acl DOC%",
  Freq_NMod = "deprel-nmod DOC%",
  Freq_Conj = "deprel-conj DOC%",
  Freq_Appos = "deprel-appos DOC%",
  Freq_Past = "upos-VERB-Tense-Past DOC%",
  Freq_Pres = "upos-VERB-Tense-Pres DOC%",
  Freq_Fut = "upos-VERB-Tense-Fut DOC%",
  Freq_Imp = "upos-VERB-Tense-Imp DOC%",
  Freq_Ind = "upos-VERB-Mood-Ind DOC%",
  Freq_Sub = "upos-VERB-Mood-Sub DOC%",
  Freq_Cnd = "upos-VERB-Mood-Cnd DOC%",
  Density = "Lexical density"
)
reference_corpus <- reference_corpus %>% rename(all_of(reference_rename_map))
COLUMN EXPLANATION
Let’s take a look at the data sets now!
# Descriptive overview of the cleaned graded-readers tibble
summary(readers_corpus)
## Title Level Freq_A1A2 Freq_B1
## Length:50 Length:50 Min. :0.1138 Min. :0.04235
## Class :character Class :character 1st Qu.:0.1906 1st Qu.:0.05429
## Mode :character Mode :character Median :0.2106 Median :0.05909
## Mean :0.2084 Mean :0.06103
## 3rd Qu.:0.2309 3rd Qu.:0.06636
## Max. :0.2763 Max. :0.09866
## Freq_B2 TFIDF_A1A2 TFIDF_B1 TFIDF_B2
## Min. :0.01248 Min. :0.04578 Min. :0.01906 Min. :0.007626
## 1st Qu.:0.02144 1st Qu.:0.08192 1st Qu.:0.03690 1st Qu.:0.019693
## Median :0.02546 Median :0.09286 Median :0.04217 Median :0.024310
## Mean :0.02848 Mean :0.09454 Mean :0.04410 Mean :0.028034
## 3rd Qu.:0.03223 3rd Qu.:0.10722 3rd Qu.:0.05141 3rd Qu.:0.029478
## Max. :0.07377 Max. :0.14312 Max. :0.07531 Max. :0.089192
## Ctxt_Freq Ctxt_TFIDF Freq_Markers Freq_AdvCl
## Min. :0.0003962 Min. :0.0003924 Min. :0.00898 Min. :0.00651
## 1st Qu.:0.0007826 1st Qu.:0.0005900 1st Qu.:0.02218 1st Qu.:0.01850
## Median :0.0013098 Median :0.0008089 Median :0.02825 Median :0.02314
## Mean :0.0014201 Mean :0.0009030 Mean :0.03311 Mean :0.02477
## 3rd Qu.:0.0018667 3rd Qu.:0.0009965 3rd Qu.:0.04008 3rd Qu.:0.02956
## Max. :0.0033756 Max. :0.0024383 Max. :0.10017 Max. :0.04884
## Freq_XComp Freq_CComp Freq_ACl Freq_NMod
## Min. :0.002604 Min. :0.0008453 Min. :0.001559 Min. :0.00815
## 1st Qu.:0.010994 1st Qu.:0.0078364 1st Qu.:0.007381 1st Qu.:0.02439
## Median :0.013102 Median :0.0124764 Median :0.009757 Median :0.02905
## Mean :0.013646 Mean :0.0120540 Mean :0.010855 Mean :0.03131
## 3rd Qu.:0.016296 3rd Qu.:0.0153399 3rd Qu.:0.014058 3rd Qu.:0.03624
## Max. :0.027732 Max. :0.0321189 Max. :0.023438 Max. :0.07552
## Freq_Conj Freq_Appos Freq_Pres Freq_Past
## Min. :0.02023 Min. :0.000000 Min. :0.002604 Min. :0.0002351
## 1st Qu.:0.02718 1st Qu.:0.006202 1st Qu.:0.041965 1st Qu.:0.0094868
## Median :0.03126 Median :0.008568 Median :0.082327 Median :0.0212278
## Mean :0.03296 Mean :0.010089 Mean :0.074371 Mean :0.0261291
## 3rd Qu.:0.03806 3rd Qu.:0.013424 3rd Qu.:0.105707 3rd Qu.:0.0422629
## Max. :0.06891 Max. :0.034286 Max. :0.131641 Max. :0.0757431
## Freq_Fut Freq_Imp Freq_Ind Freq_Sub
## Min. :0.0000000 Min. :0.000000 Min. :0.0599 Min. :0.0000000
## 1st Qu.:0.0001297 1st Qu.:0.001783 1st Qu.:0.1035 1st Qu.:0.0006185
## Median :0.0014752 Median :0.011088 Median :0.1128 Median :0.0016107
## Mean :0.0020725 Mean :0.018601 Mean :0.1104 Mean :0.0034776
## 3rd Qu.:0.0031379 3rd Qu.:0.031652 3rd Qu.:0.1210 3rd Qu.:0.0050158
## Max. :0.0081425 Max. :0.064645 Max. :0.1481 Max. :0.0219371
## Freq_Cnd Density msttr mtld
## Min. :0.0000000 Min. :0.3874 Min. :0.6550 Min. : 53.36
## 1st Qu.:0.0000000 1st Qu.:0.4077 1st Qu.:0.7072 1st Qu.: 83.61
## Median :0.0004377 Median :0.4182 Median :0.7261 Median : 95.08
## Mean :0.0010487 Mean :0.4199 Mean :0.7227 Mean : 95.09
## 3rd Qu.:0.0012980 3rd Qu.:0.4269 3rd Qu.:0.7385 3rd Qu.:107.60
## Max. :0.0092402 Max. :0.4956 Max. :0.7658 Max. :127.83
## hdd Herdan Min_Width Max_Width
## Min. :0.8258 Min. :0.8082 Min. :1.00 Min. :11.00
## 1st Qu.:0.8481 1st Qu.:0.8417 1st Qu.:1.00 1st Qu.:15.00
## Median :0.8634 Median :0.8640 Median :2.00 Median :17.00
## Mean :0.8592 Mean :0.8606 Mean :1.72 Mean :18.32
## 3rd Qu.:0.8687 3rd Qu.:0.8766 3rd Qu.:2.00 3rd Qu.:21.50
## Max. :0.8845 Max. :0.9086 Max. :4.00 Max. :32.00
## Avg_Width Min_Depth Max_Depth Avg_Depth Ctxt_Min_Width
## Min. : 5.219 Min. :1.0 Min. : 5.00 Min. :3.106 Min. :1.0
## 1st Qu.: 6.021 1st Qu.:1.0 1st Qu.: 7.00 1st Qu.:3.436 1st Qu.:1.0
## Median : 6.967 Median :1.0 Median : 8.00 Median :4.058 Median :1.0
## Mean : 7.362 Mean :1.1 Mean : 9.66 Mean :4.277 Mean :1.5
## 3rd Qu.: 8.257 3rd Qu.:1.0 3rd Qu.:11.00 3rd Qu.:4.836 3rd Qu.:2.0
## Max. :11.956 Max. :2.0 Max. :20.00 Max. :7.005 Max. :4.0
## Ctxt_Max_Width Ctxt_Avg_Width Ctxt_Min_Depth Ctxt_Max_Depth Ctxt_Avg_Depth
## Min. :11.00 Min. :4.406 Min. :1.00 Min. : 5.00 Min. :2.476
## 1st Qu.:15.00 1st Qu.:5.047 1st Qu.:1.00 1st Qu.: 7.00 1st Qu.:2.725
## Median :17.00 Median :5.598 Median :1.00 Median : 8.00 Median :3.055
## Mean :18.32 Mean :6.031 Mean :1.08 Mean : 9.66 Mean :3.406
## 3rd Qu.:21.50 3rd Qu.:6.616 3rd Qu.:1.00 3rd Qu.:11.00 3rd Qu.:3.705
## Max. :32.00 Max. :9.845 Max. :2.00 Max. :20.00 Max. :5.501
# Descriptive overview of the cleaned literary-works tibble
summary(literature_corpus)
## Title Level Freq_A1A2 Freq_B1
## Length:50 Length:50 Min. :0.1136 Min. :0.04577
## Class :character Class :character 1st Qu.:0.1794 1st Qu.:0.05556
## Mode :character Mode :character Median :0.1904 Median :0.05977
## Mean :0.1920 Mean :0.06323
## 3rd Qu.:0.2047 3rd Qu.:0.06565
## Max. :0.2898 Max. :0.13870
## Freq_B2 TFIDF_A1A2 TFIDF_B1 TFIDF_B2
## Min. :0.01890 Min. :0.04379 Min. :0.02675 Min. :0.01218
## 1st Qu.:0.02767 1st Qu.:0.06263 1st Qu.:0.03314 1st Qu.:0.02141
## Median :0.03167 Median :0.07404 Median :0.03957 Median :0.02750
## Mean :0.03274 Mean :0.08005 Mean :0.04084 Mean :0.02817
## 3rd Qu.:0.03599 3rd Qu.:0.09266 3rd Qu.:0.04430 3rd Qu.:0.03447
## Max. :0.05742 Max. :0.13644 Max. :0.10853 Max. :0.05326
## Ctxt_Freq Ctxt_TFIDF Freq_Markers Freq_AdvCl
## Min. :0.0004093 Min. :0.0003712 Min. :0.02626 Min. :0.01377
## 1st Qu.:0.0005888 1st Qu.:0.0004827 1st Qu.:0.03710 1st Qu.:0.02558
## Median :0.0011018 Median :0.0007973 Median :0.04287 Median :0.02995
## Mean :0.0015707 Mean :0.0009023 Mean :0.04336 Mean :0.03137
## 3rd Qu.:0.0021075 3rd Qu.:0.0012378 3rd Qu.:0.04855 3rd Qu.:0.03394
## Max. :0.0047613 Max. :0.0024171 Max. :0.07274 Max. :0.05173
## Freq_XComp Freq_CComp Freq_ACl Freq_NMod
## Min. :0.005269 Min. :0.004918 Min. :0.003701 Min. :0.008239
## 1st Qu.:0.012109 1st Qu.:0.011840 1st Qu.:0.010534 1st Qu.:0.022233
## Median :0.015944 Median :0.014177 Median :0.013848 Median :0.025579
## Mean :0.015815 Mean :0.014730 Mean :0.013829 Mean :0.027059
## 3rd Qu.:0.018179 3rd Qu.:0.017407 3rd Qu.:0.017341 3rd Qu.:0.030829
## Max. :0.032590 Max. :0.027444 Max. :0.021779 Max. :0.053589
## Freq_Conj Freq_Appos Freq_Pres Freq_Past
## Min. :0.01119 Min. :0.001083 Min. :0.003279 Min. :0.009864
## 1st Qu.:0.02910 1st Qu.:0.002440 1st Qu.:0.021214 1st Qu.:0.042088
## Median :0.03228 Median :0.004649 Median :0.032619 Median :0.048117
## Mean :0.03279 Mean :0.004911 Mean :0.038143 Mean :0.047313
## 3rd Qu.:0.03687 3rd Qu.:0.005939 3rd Qu.:0.048881 3rd Qu.:0.059149
## Max. :0.06856 Max. :0.013388 Max. :0.138101 Max. :0.078007
## Freq_Fut Freq_Imp Freq_Ind Freq_Sub
## Min. :0.000000 Min. :0.00000 Min. :0.07315 Min. :0.0009794
## 1st Qu.:0.001155 1st Qu.:0.02588 1st Qu.:0.10459 1st Qu.:0.0036746
## Median :0.003112 Median :0.03885 Median :0.11307 Median :0.0051907
## Mean :0.003355 Mean :0.03794 Mean :0.11277 Mean :0.0055350
## 3rd Qu.:0.004448 3rd Qu.:0.04964 3rd Qu.:0.11988 3rd Qu.:0.0069880
## Max. :0.012330 Max. :0.06690 Max. :0.14673 Max. :0.0117270
## Freq_Cnd Density msttr mtld
## Min. :0.0000000 Min. :0.3925 Min. :0.6805 Min. : 73.51
## 1st Qu.:0.0007332 1st Qu.:0.4118 1st Qu.:0.7202 1st Qu.: 93.83
## Median :0.0014536 Median :0.4209 Median :0.7309 Median :101.42
## Mean :0.0017794 Mean :0.4240 Mean :0.7312 Mean :103.76
## 3rd Qu.:0.0026394 3rd Qu.:0.4351 3rd Qu.:0.7433 3rd Qu.:110.78
## Max. :0.0057773 Max. :0.4871 Max. :0.7967 Max. :154.88
## hdd Herdan Min_Width Max_Width
## Min. :0.8243 Min. :0.8337 Min. :1.00 Min. : 7.00
## 1st Qu.:0.8437 1st Qu.:0.8562 1st Qu.:2.00 1st Qu.:14.00
## Median :0.8537 Median :0.8678 Median :2.00 Median :18.00
## Mean :0.8550 Mean :0.8691 Mean :2.14 Mean :18.26
## 3rd Qu.:0.8659 3rd Qu.:0.8848 3rd Qu.:2.75 3rd Qu.:21.00
## Max. :0.8899 Max. :0.9171 Max. :5.00 Max. :33.00
## Avg_Width Min_Depth Max_Depth Avg_Depth Ctxt_Min_Width
## Min. : 4.172 Min. :1.00 Min. : 5.00 Min. :2.956 Min. :1.00
## 1st Qu.: 7.011 1st Qu.:1.00 1st Qu.: 8.25 1st Qu.:4.149 1st Qu.:1.00
## Median : 7.854 Median :1.00 Median :10.00 Median :4.733 Median :2.00
## Mean : 8.149 Mean :1.08 Mean :10.50 Mean :4.658 Mean :1.82
## 3rd Qu.: 9.255 3rd Qu.:1.00 3rd Qu.:11.00 3rd Qu.:5.129 3rd Qu.:2.00
## Max. :14.357 Max. :2.00 Max. :45.00 Max. :7.750 Max. :3.00
## Ctxt_Max_Width Ctxt_Avg_Width Ctxt_Min_Depth Ctxt_Max_Depth
## Min. : 7.00 Min. : 3.382 Min. :1.00 Min. : 5.00
## 1st Qu.:14.00 1st Qu.: 6.271 1st Qu.:1.00 1st Qu.: 8.25
## Median :18.00 Median : 6.823 Median :1.00 Median :10.00
## Mean :18.26 Mean : 6.834 Mean :1.02 Mean :10.50
## 3rd Qu.:21.00 3rd Qu.: 7.666 3rd Qu.:1.00 3rd Qu.:11.00
## Max. :33.00 Max. :11.036 Max. :2.00 Max. :45.00
## Ctxt_Avg_Depth
## Min. :2.418
## 1st Qu.:3.417
## Median :3.849
## Mean :3.841
## 3rd Qu.:4.204
## Max. :5.795
# Descriptive overview of the cleaned reference-corpus tibble
summary(reference_corpus)
## Freq_A1A2 Freq_B1 Freq_B2 TFIDF_A1A2
## Min. :0.0163 Min. :0.00000 Min. :0.00000 Min. :0.007464
## 1st Qu.:0.1102 1st Qu.:0.05202 1st Qu.:0.04096 1st Qu.:0.101886
## Median :0.1282 Median :0.06459 Median :0.05285 Median :0.127978
## Mean :0.1298 Mean :0.06546 Mean :0.05468 Mean :0.131676
## 3rd Qu.:0.1483 3rd Qu.:0.07826 3rd Qu.:0.06606 3rd Qu.:0.155976
## Max. :0.2590 Max. :0.20930 Max. :0.18750 Max. :0.303396
## TFIDF_B1 TFIDF_B2 Ctxt_Freq Ctxt_TFIDF
## Min. :0.00000 Min. :0.00000 Min. :0.001719 Min. :0.002539
## 1st Qu.:0.07567 1st Qu.:0.06533 1st Qu.:0.003947 1st Qu.:0.005356
## Median :0.09512 Median :0.08563 Median :0.004920 Median :0.006680
## Mean :0.09678 Mean :0.08808 Mean :0.006375 Mean :0.008078
## 3rd Qu.:0.11637 3rd Qu.:0.10737 3rd Qu.:0.006551 3rd Qu.:0.008613
## Max. :0.26066 Max. :0.24721 Max. :0.042411 Max. :0.048931
## Freq_Markers Freq_AdvCl Freq_XComp Freq_CComp
## Min. :0.00000 Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.02222 1st Qu.:0.009662 1st Qu.:0.003205 1st Qu.:0.002924
## Median :0.03315 Median :0.015773 Median :0.007335 Median :0.008287
## Mean :0.03442 Mean :0.016769 Mean :0.008506 Mean :0.010376
## 3rd Qu.:0.04589 3rd Qu.:0.022814 3rd Qu.:0.012526 3rd Qu.:0.015789
## Max. :0.17241 Max. :0.068182 Max. :0.047619 Max. :0.064516
## Freq_ACl Freq_NMod Freq_Conj Freq_Appos
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.01208 1st Qu.:0.04792 1st Qu.:0.02096 1st Qu.:0.00565
## Median :0.01724 Median :0.06468 Median :0.03030 Median :0.01149
## Mean :0.01802 Mean :0.06598 Mean :0.03246 Mean :0.01395
## 3rd Qu.:0.02317 3rd Qu.:0.08156 3rd Qu.:0.04073 3rd Qu.:0.01833
## Max. :0.06897 Max. :0.20000 Max. :0.11957 Max. :0.14674
## Freq_Pres Freq_Past Freq_Fut Freq_Imp
## Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.02148 1st Qu.:0.01544 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.03883 Median :0.02689 Median :0.000000 Median :0.003650
## Mean :0.04205 Mean :0.02876 Mean :0.004208 Mean :0.009057
## 3rd Qu.:0.05902 3rd Qu.:0.04075 3rd Qu.:0.005736 3rd Qu.:0.011682
## Max. :0.13548 Max. :0.07812 Max. :0.048276 Max. :0.088328
## Freq_Ind Freq_Sub Freq_Cnd Density
## Min. :0.00000 Min. :0.000000 Min. :0.000000 Min. :0.3112
## 1st Qu.:0.05581 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.4228
## Median :0.06838 Median :0.003731 Median :0.000000 Median :0.4396
## Mean :0.07018 Mean :0.005091 Mean :0.001587 Mean :0.4402
## 3rd Qu.:0.08203 3rd Qu.:0.007595 3rd Qu.:0.002336 3rd Qu.:0.4572
## Max. :0.14815 Max. :0.047619 Max. :0.066667 Max. :0.5667
## msttr mtld hdd Herdan
## Min. :0.4900 Min. : 21.00 Min. :0.6518 Min. :0.8355
## 1st Qu.:0.6783 1st Qu.: 67.99 1st Qu.:0.7979 1st Qu.:0.8890
## Median :0.7050 Median : 81.42 Median :0.8183 Median :0.9017
## Mean :0.7095 Mean : 85.19 Mean :0.8189 Mean :0.9031
## 3rd Qu.:0.7350 3rd Qu.: 97.89 3rd Qu.:0.8425 3rd Qu.:0.9153
## Max. :1.0000 Max. :202.39 Max. :1.0000 Max. :1.0000
## Min_Width Max_Width Avg_Width Min_Depth
## Min. : 1.000 Min. : 4.00 Min. : 3.800 Min. : 1.000
## 1st Qu.: 4.000 1st Qu.:13.00 1st Qu.: 9.198 1st Qu.: 2.000
## Median : 5.000 Median :16.00 Median :10.881 Median : 3.000
## Mean : 5.356 Mean :15.89 Mean :10.756 Mean : 3.328
## 3rd Qu.: 7.000 3rd Qu.:18.00 3rd Qu.:12.204 3rd Qu.: 4.000
## Max. :22.000 Max. :56.00 Max. :22.000 Max. :10.000
## Max_Depth Avg_Depth Ctxt_Min_Width Ctxt_Max_Width
## Min. : 2.000 Min. : 1.800 Min. : 1.000 Min. : 4.0
## 1st Qu.: 7.000 1st Qu.: 5.423 1st Qu.: 3.000 1st Qu.:13.0
## Median : 9.000 Median : 6.260 Median : 5.000 Median :16.0
## Mean : 8.834 Mean : 6.246 Mean : 5.164 Mean :15.9
## 3rd Qu.:10.000 3rd Qu.: 7.042 3rd Qu.: 6.000 3rd Qu.:18.0
## Max. :20.000 Max. :11.886 Max. :22.000 Max. :56.0
## Ctxt_Avg_Width Ctxt_Min_Depth Ctxt_Max_Depth Ctxt_Avg_Depth
## Min. : 3.667 Min. : 1.000 Min. : 2.000 Min. : 1.667
## 1st Qu.: 8.200 1st Qu.: 2.000 1st Qu.: 7.000 1st Qu.: 4.800
## Median : 9.778 Median : 3.000 Median : 9.000 Median : 5.700
## Mean : 9.752 Mean : 3.195 Mean : 8.834 Mean : 5.682
## 3rd Qu.:11.222 3rd Qu.: 4.000 3rd Qu.:10.000 3rd Qu.: 6.545
## Max. :22.000 Max. :10.000 Max. :20.000 Max. :10.500
The graded readers and the literary works tibbles have the exact same structure: 42 columns (variables) x 50 rows (works considered). The reference corpus has virtually the same columns, save for the first two (Title and Level), which are not there because we make no distinction among the texts composing the corpus. The number of rows, however, is different, since this corpus is made up of 609 files. Therefore, the resulting tibble has a structure of 40 columns x 609 rows.
DESCRIPTIVE STATISTICS
Occurring LI
Sankey charts depicting the relative frequency of graded vocabulary occurrences per corpus and lexical item level.
Parse trees’ properties
Violin plots depicting the properties of the parse trees of the sentences where the graded vocabulary and its context occur.
Relative frequencies of moods
Density ridgeline plots
## Picking joint bandwidth of 0.00694
## Picking joint bandwidth of 0.0014
## Picking joint bandwidth of 0.000354
## Picking joint bandwidth of 0.00443
## Picking joint bandwidth of 0.00121
## Picking joint bandwidth of 0.000596
## Picking joint bandwidth of 0.00488
## Picking joint bandwidth of 0.00141
## Picking joint bandwidth of 0.000435
ASSESSING MULTICOLLINEARITY
Pairwise Spearman correlations
par(mfrow=c(2,1))
# === GRADED READERS ===
# Pairwise Spearman correlation matrix over the numeric features
# (columns 1-2 are Title and Level)
readers_corr <- cor(dplyr::select(readers_corpus, -c(1, 2)), method = "spearman")
# Correlogram. corrplot() is called for its plotting side effect only; its
# return value is deliberately discarded (previously it was assigned back to
# readers_corr, silently replacing the correlation matrix).
corrplot(readers_corr, type = "lower", order = "hclust",
         title = "Graded readers", mar = c(0, 0, 1, 0), tl.cex = 1,
         tl.col = "black", tl.srt = 45, cex.main = 1.5)
# Keep the name used below; reuse the matrix instead of recomputing cor()
readers_corr_matrix <- readers_corr
# Long format: one row per unordered variable pair. The pmin/pmax trick
# canonicalizes each pair alphabetically so (A, B) and (B, A) collapse to
# a single row via distinct(); the diagonal is dropped first.
readers_corr_long <- melt(readers_corr_matrix, varnames = c("Var1", "Var2"),
                          value.name = "Correlation") %>%
  filter(Var1 != Var2) %>%
  mutate(Var1_order = pmin(as.character(Var1), as.character(Var2)),
         Var2_order = pmax(as.character(Var1), as.character(Var2))) %>%
  distinct(Var1_order, Var2_order, .keep_all = TRUE)
# Strongest correlations, ranked by magnitude in each direction
readers_strongest_positive <- readers_corr_long %>%
  filter(Correlation > 0) %>%
  arrange(desc(Correlation))
readers_strongest_negative <- readers_corr_long %>%
  filter(Correlation < 0) %>%
  arrange(Correlation)
# === LITERARY WORKS ===
# Pairwise Spearman correlation matrix over the numeric features
# (columns 1-2 are Title and Level)
literature_corr <- cor(dplyr::select(literature_corpus, -c(1, 2)), method = "spearman")
# Correlogram. corrplot() is called for its plotting side effect only; its
# return value is deliberately discarded (previously it was assigned back to
# literature_corr, silently replacing the correlation matrix).
corrplot(literature_corr, type = "lower", order = "hclust",
         title = "Literary works", mar = c(0, 0, 1, 0), tl.cex = 1,
         tl.col = "black", tl.srt = 45, cex.main = 1.5)
# Keep the name used below; reuse the matrix instead of recomputing cor()
literature_corr_matrix <- literature_corr
# Long format: one row per unordered variable pair (diagonal dropped,
# symmetric duplicates collapsed via alphabetical pmin/pmax ordering)
literature_corr_long <- melt(literature_corr_matrix, varnames = c("Var1", "Var2"),
                             value.name = "Correlation") %>%
  filter(Var1 != Var2) %>%
  mutate(Var1_order = pmin(as.character(Var1), as.character(Var2)),
         Var2_order = pmax(as.character(Var1), as.character(Var2))) %>%
  distinct(Var1_order, Var2_order, .keep_all = TRUE)
# Strongest correlations, ranked by magnitude in each direction
literature_strongest_positive <- literature_corr_long %>%
  filter(Correlation > 0) %>%
  arrange(desc(Correlation))
literature_strongest_negative <- literature_corr_long %>%
  filter(Correlation < 0) %>%
  arrange(Correlation)
RANDOM FORESTS
# === GRADED READERS ===
# --- Step 1. Split the data into a training (70%) and testing (30%) sets using a stratified random
# sampling approach ---
# Fixed seed so the train/test partition is reproducible
set.seed(93)
# group_by(Level) makes slice_sample() draw 70% *within each level*,
# i.e. the split is stratified by difficulty level
readers_training <- readers_corpus %>%
group_by(Level) %>%
slice_sample(prop=0.7)
# Test set = every title NOT drawn into the training set
readers_testing <- anti_join(readers_corpus, readers_training, by = "Title")
# --- Step 2. Fit the random forest model ---
# Separate seed for the forest's internal bootstrap/feature sampling
set.seed(1)
# Level ~ all predictors except Title; mtry = 6 candidate variables per
# split, ntree = 183 trees; importance and proximity are kept for the
# plots in Steps 5-6 below.
# NOTE(review): randomForest() performs classification only when the
# response is a factor — Level prints as character in summary() above, so
# it is presumably converted to a factor in a chunk not shown here; confirm.
readers_model <- randomForest(Level ~ . - Title, data = readers_training, mtry=6, ntree=183,
importance=TRUE, proximity=TRUE)
# Print the fitted model: OOB error estimate and training confusion matrix
readers_model
##
## Call:
## randomForest(formula = Level ~ . - Title, data = readers_training, mtry = 6, ntree = 183, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 183
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 18.18%
## Confusion matrix:
## Beginner Intermediate Advanced class.error
## Beginner 10 1 0 0.09090909
## Intermediate 2 10 1 0.23076923
## Advanced 0 2 7 0.22222222
# --- Step 3. Plot the out-of-bag error rate information ---
# err.rate has one row per tree and one column per class plus "OOB".
# Build a long-format data frame with one row per (tree, error series):
# the tree index is repeated explicitly for each of the four series,
# rather than relying on data.frame()'s silent recycling as before.
n_trees <- nrow(readers_model$err.rate)
error_types <- c("OOB", "Beginner", "Intermediate", "Advanced")
readers_obb_error_data <- data.frame(
  Trees = rep(seq_len(n_trees), times = length(error_types)),
  Type = rep(error_types, each = n_trees),
  Error = c(readers_model$err.rate[, "OOB"],
            readers_model$err.rate[, "Beginner"],
            readers_model$err.rate[, "Intermediate"],
            readers_model$err.rate[, "Advanced"])
)
# Fix the legend order: the three classes first, overall OOB last
readers_obb_error_data$Type <- factor(readers_obb_error_data$Type,
                                      levels = c("Beginner", "Intermediate", "Advanced", "OOB"))
# One error-rate trajectory per series, colored by series type
ggplot(readers_obb_error_data, aes(x = Trees, y = Error)) +
  geom_line(aes(color = Type)) +
  ggtitle("Graded readers' out-of-bag (OOB) error rate information") +
  theme(legend.key.size = unit(1, 'cm'),
        legend.title = element_blank(),
        legend.text = element_text(size = 20),
        axis.text.x = element_text(size = 15),
        axis.title.x = element_text(size = 20),
        axis.text.y = element_text(size = 18),
        axis.title.y = element_text(size = 20)) +
  ylim(0, 0.9)
# --- Step 4. Predict classes for the test set ---
# Held-out evaluation on the 30% split; confusionMatrix() reports overall
# accuracy/kappa plus one-vs-all statistics per class.
readers_predict <- predict(readers_model, readers_testing)
confusionMatrix(readers_predict, readers_testing$Level)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Beginner Intermediate Advanced
## Beginner 6 0 2
## Intermediate 0 4 1
## Advanced 0 2 2
##
## Overall Statistics
##
## Accuracy : 0.7059
## 95% CI : (0.4404, 0.8969)
## No Information Rate : 0.3529
## P-Value [Acc > NIR] : 0.003268
##
## Kappa : 0.555
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Beginner Class: Intermediate Class: Advanced
## Sensitivity 1.0000 0.6667 0.4000
## Specificity 0.8182 0.9091 0.8333
## Pos Pred Value 0.7500 0.8000 0.5000
## Neg Pred Value 1.0000 0.8333 0.7692
## Prevalence 0.3529 0.3529 0.2941
## Detection Rate 0.3529 0.2353 0.1176
## Detection Prevalence 0.4706 0.2941 0.2353
## Balanced Accuracy 0.9091 0.7879 0.6167
# --- Step 5. Let's plot the dot charts of variable importance ---
# Dot charts of MeanDecreaseAccuracy / MeanDecreaseGini per predictor.
# n.var = 40 assumes at least 40 predictors are available -- TODO confirm.
varImpPlot(readers_model,
sort = T,
n.var = 40,
main = "Graded readers' random forest variable importance")
# --- Step 6. Multi-dimensional Scaling Plot of Proximity Matrix ---
# 1 - proximity turns the RF proximity matrix into a dissimilarity, which
# classical MDS (cmdscale) projects into two dimensions.
readers_distance.matrix <- dist(1-readers_model$proximity)
readers_mds.stuff <- cmdscale(readers_distance.matrix, eig=TRUE, x.ret=TRUE)
# Percentage of total variation captured by each MDS axis (used in the labels).
readers_mds.var.per <- round(readers_mds.stuff$eig/sum(readers_mds.stuff$eig)*100, 1)
readers_mds.values <- readers_mds.stuff$points
readers_mds.data <- data.frame(Sample=rownames(readers_mds.values),
X=readers_mds.values[,1],
Y=readers_mds.values[,2],
Status=readers_training$Level)
# Plot each training title at its MDS coordinates, colored by level.
ggplot(data=readers_mds.data, aes(x=X, y=Y, label=Sample)) +
geom_text(size = 10, aes(color=Status)) +
theme_bw() +
xlab(paste("MDS1 - ", readers_mds.var.per[1], "%", sep="")) +
ylab(paste("MDS2 - ", readers_mds.var.per[2], "%", sep="")) +
ggtitle("GRs MDS plot (1 - RF proximities)") +
theme(legend.title = element_blank(),
legend.text = element_text(size=20)) +
theme(axis.text.x=element_text(size=15),
axis.title.x=element_text(size=20),
axis.text.y=element_text(size=18),
axis.title.y=element_text(size=20)) +
theme(plot.title = element_text(hjust = 0.5, size=30))
# === LITERARY WORKS ===
# --- Step 1. Split the data into a training (70%) and testing (30%) sets using a stratified random
# sampling approach ---
# Stratify by Level so each age band keeps ~70% of its titles in training.
set.seed(45)
literature_training <- literature_corpus %>%
  group_by(Level) %>%
  slice_sample(prop = 0.7) %>%
  # FIX: drop the grouping left behind by group_by()/slice_sample() so a plain
  # tibble (not a grouped one) is passed to randomForest() and anti_join() below.
  ungroup()
# The testing set is everything not sampled into training; assumes Title is a
# unique key across the corpus -- TODO confirm.
literature_testing <- anti_join(literature_corpus, literature_training, by = "Title")
# --- Step 2. Fit the random forest model ---
# Seed fixed so the bootstrap samples (and hence the OOB estimates printed
# below) are reproducible. mtry = 6 and ntree = 131 were presumably tuned
# beforehand -- TODO confirm.
set.seed(11)
literature_model <- randomForest(Level ~ . - Title, data = literature_training, mtry=6, ntree=131, importance=TRUE, proximity=TRUE)
# Printing the model shows the OOB error estimate and confusion matrix.
literature_model
##
## Call:
## randomForest(formula = Level ~ . - Title, data = literature_training, mtry = 6, ntree = 131, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 131
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 17.14%
## Confusion matrix:
## Children's literature Young adult fiction
## Children's literature 19 2
## Young adult fiction 2 4
## Adult literature 0 1
## Adult literature class.error
## Children's literature 0 0.0952381
## Young adult fiction 1 0.4285714
## Adult literature 6 0.1428571
# --- Step 3. Plot the out-of-bag error rate information ---
# Same reshape as for the graded readers: err.rate has one row per tree, with
# the running OOB error plus one column per class.
# FIX: the original rep(1:nrow(...)) had no 'times' argument and relied on
# data.frame() silently recycling the Trees column; make it explicit.
literature_obb_error_data <- data.frame(
  Trees=rep(1:nrow(literature_model$err.rate), times=4),
  Type=rep(c("OOB", "Children's literature", "Young adult fiction", "Adult literature"), each=nrow(literature_model$err.rate)),
  Error=c(literature_model$err.rate[, "OOB"],
          literature_model$err.rate[, "Children's literature"],
          literature_model$err.rate[, "Young adult fiction"],
          literature_model$err.rate[, "Adult literature"])
)
# Order the legend by age band, with the overall OOB series last.
literature_obb_error_data$Type <- factor(literature_obb_error_data$Type, levels = c("Children's literature", "Young adult fiction", "Adult literature", "OOB"))
ggplot(data=literature_obb_error_data, aes(x=Trees, y=Error)) +
  geom_line(aes(color=Type)) +
  # Title added for consistency with the graded readers' OOB plot above.
  ggtitle("Literary works' out-of-bag (OOB) error rate information") +
  theme(legend.key.size = unit(1, 'cm'),
        legend.title = element_blank(),
        legend.text = element_text(size=20)) +
  theme(axis.text.x=element_text(size=15),
        axis.title.x=element_text(size=20),
        axis.text.y=element_text(size=18),
        axis.title.y=element_text(size=20)) +
  ylim(0, 1)
# --- Step 4. Predict classes for the test set ---
# Held-out evaluation on the 30% split; confusionMatrix() reports overall
# accuracy/kappa plus one-vs-all statistics per class.
literature_predict <- predict(literature_model, literature_testing)
confusionMatrix(literature_predict, literature_testing$Level)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Children's literature Young adult fiction
## Children's literature 7 1
## Young adult fiction 1 2
## Adult literature 1 0
## Reference
## Prediction Adult literature
## Children's literature 0
## Young adult fiction 1
## Adult literature 2
##
## Overall Statistics
##
## Accuracy : 0.7333
## 95% CI : (0.449, 0.9221)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.2173
##
## Kappa : 0.5455
##
## Mcnemar's Test P-Value : 0.5724
##
## Statistics by Class:
##
## Class: Children's literature Class: Young adult fiction
## Sensitivity 0.7778 0.6667
## Specificity 0.8333 0.8333
## Pos Pred Value 0.8750 0.5000
## Neg Pred Value 0.7143 0.9091
## Prevalence 0.6000 0.2000
## Detection Rate 0.4667 0.1333
## Detection Prevalence 0.5333 0.2667
## Balanced Accuracy 0.8056 0.7500
## Class: Adult literature
## Sensitivity 0.6667
## Specificity 0.9167
## Pos Pred Value 0.6667
## Neg Pred Value 0.9167
## Prevalence 0.2000
## Detection Rate 0.1333
## Detection Prevalence 0.2000
## Balanced Accuracy 0.7917
# --- Step 5. Let's plot the dot charts of variable importance ---
# Dot charts of MeanDecreaseAccuracy / MeanDecreaseGini per predictor.
# n.var = 40 assumes at least 40 predictors are available -- TODO confirm.
varImpPlot(literature_model,
sort = T,
n.var = 40,
main = "Literary works' random forest variable importance")
# --- Step 6. Multi-dimensional Scaling Plot of Proximity Matrix ---
# 1 - proximity turns the RF proximity matrix into a dissimilarity, which
# classical MDS (cmdscale) projects into two dimensions.
literature_distance.matrix <- dist(1-literature_model$proximity)
literature_mds.stuff <- cmdscale(literature_distance.matrix, eig=TRUE, x.ret=TRUE)
# Percentage of total variation captured by each MDS axis (used in the labels).
literature_mds.var.per <- round(literature_mds.stuff$eig/sum(literature_mds.stuff$eig)*100, 1)
literature_mds.values <- literature_mds.stuff$points
literature_mds.data <- data.frame(Sample=rownames(literature_mds.values),
X=literature_mds.values[,1],
Y=literature_mds.values[,2],
Status=literature_training$Level)
# Plot each training title at its MDS coordinates, colored by level.
ggplot(data=literature_mds.data, aes(x=X, y=Y, label=Sample)) +
geom_text(size = 10, aes(color=Status)) +
theme_bw() +
xlab(paste("MDS1 - ", literature_mds.var.per[1], "%", sep="")) +
ylab(paste("MDS2 - ", literature_mds.var.per[2], "%", sep="")) +
ggtitle("LWs MDS plot (1 - RF proximities)") +
theme(legend.title = element_blank(),
legend.text = element_text(size=20)) +
theme(axis.text.x=element_text(size=15),
axis.title.x=element_text(size=20),
axis.text.y=element_text(size=18),
axis.title.y=element_text(size=20)) +
theme(plot.title = element_text(hjust = 0.5, size=30))
# === BASELINE CLASSIFIERS ===
# --- 1. Zero-rate ---
# Zero-rate (majority-class) baseline: predict the most frequent test-set class
# for every instance. which.max() breaks ties by taking the first factor level.
# Find the most frequent class in readers_testing
most_frequent_class_readers <- names(which.max(table(readers_testing$Level)))
# Predict all instances as the most frequent class
readers_zero_rate_pred <- factor(rep(most_frequent_class_readers, nrow(readers_testing)),
levels = levels(readers_testing$Level))
# Evaluate performance
confusionMatrix(readers_zero_rate_pred, readers_testing$Level)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Beginner Intermediate Advanced
## Beginner 6 6 5
## Intermediate 0 0 0
## Advanced 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.3529
## 95% CI : (0.1421, 0.6167)
## No Information Rate : 0.3529
## P-Value [Acc > NIR] : 0.5903
##
## Kappa : 0
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Beginner Class: Intermediate Class: Advanced
## Sensitivity 1.0000 0.0000 0.0000
## Specificity 0.0000 1.0000 1.0000
## Pos Pred Value 0.3529 NaN NaN
## Neg Pred Value NaN 0.6471 0.7059
## Prevalence 0.3529 0.3529 0.2941
## Detection Rate 0.3529 0.0000 0.0000
## Detection Prevalence 1.0000 0.0000 0.0000
## Balanced Accuracy 0.5000 0.5000 0.5000
# Repeat for literature_testing
# Same majority-class baseline as above, applied to the literary works split.
most_frequent_class_literature <- names(which.max(table(literature_testing$Level)))
literature_zero_rate_pred <- factor(rep(most_frequent_class_literature, nrow(literature_testing)),
levels = levels(literature_testing$Level))
confusionMatrix(literature_zero_rate_pred, literature_testing$Level)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Children's literature Young adult fiction
## Children's literature 9 3
## Young adult fiction 0 0
## Adult literature 0 0
## Reference
## Prediction Adult literature
## Children's literature 3
## Young adult fiction 0
## Adult literature 0
##
## Overall Statistics
##
## Accuracy : 0.6
## 95% CI : (0.3229, 0.8366)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.6098
##
## Kappa : 0
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Children's literature Class: Young adult fiction
## Sensitivity 1.0 0.0
## Specificity 0.0 1.0
## Pos Pred Value 0.6 NaN
## Neg Pred Value NaN 0.8
## Prevalence 0.6 0.2
## Detection Rate 0.6 0.0
## Detection Prevalence 1.0 0.0
## Balanced Accuracy 0.5 0.5
## Class: Adult literature
## Sensitivity 0.0
## Specificity 1.0
## Pos Pred Value NaN
## Neg Pred Value 0.8
## Prevalence 0.2
## Detection Rate 0.0
## Detection Prevalence 0.0
## Balanced Accuracy 0.5
# --- 2. Weighted guessing ---
# Weighted-guessing baseline: draw predicted labels at random with
# probabilities equal to the class proportions observed in the training set.
set.seed(123)
# Compute class probabilities based on readers_training
class_probs_readers <- prop.table(table(readers_training$Level))
# Predict based on these probabilities
readers_weighted_pred <- factor(sample(names(class_probs_readers),
size = nrow(readers_testing),
replace = TRUE,
prob = class_probs_readers),
levels = levels(readers_testing$Level))
# Evaluate performance
confusionMatrix(readers_weighted_pred, readers_testing$Level)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Beginner Intermediate Advanced
## Beginner 3 3 1
## Intermediate 2 1 1
## Advanced 1 2 3
##
## Overall Statistics
##
## Accuracy : 0.4118
## 95% CI : (0.1844, 0.6708)
## No Information Rate : 0.3529
## P-Value [Acc > NIR] : 0.3912
##
## Kappa : 0.1192
##
## Mcnemar's Test P-Value : 0.9115
##
## Statistics by Class:
##
## Class: Beginner Class: Intermediate Class: Advanced
## Sensitivity 0.5000 0.16667 0.6000
## Specificity 0.6364 0.72727 0.7500
## Pos Pred Value 0.4286 0.25000 0.5000
## Neg Pred Value 0.7000 0.61538 0.8182
## Prevalence 0.3529 0.35294 0.2941
## Detection Rate 0.1765 0.05882 0.1765
## Detection Prevalence 0.4118 0.23529 0.3529
## Balanced Accuracy 0.5682 0.44697 0.6750
# Repeat for literature_testing
# NOTE(review): no fresh set.seed() here -- these draws continue the RNG stream
# started by set.seed(123) above, so this block is only reproducible when run
# immediately after the readers baseline. Confirm this ordering is intended.
class_probs_literature <- prop.table(table(literature_training$Level))
literature_weighted_pred <- factor(sample(names(class_probs_literature),
size = nrow(literature_testing),
replace = TRUE,
prob = class_probs_literature),
levels = levels(literature_testing$Level))
confusionMatrix(literature_weighted_pred, literature_testing$Level)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Children's literature Young adult fiction
## Children's literature 2 3
## Young adult fiction 3 0
## Adult literature 4 0
## Reference
## Prediction Adult literature
## Children's literature 1
## Young adult fiction 2
## Adult literature 0
##
## Overall Statistics
##
## Accuracy : 0.1333
## 95% CI : (0.0166, 0.4046)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 1.0000
##
## Kappa : -0.3542
##
## Mcnemar's Test P-Value : 0.2839
##
## Statistics by Class:
##
## Class: Children's literature Class: Young adult fiction
## Sensitivity 0.2222 0.0000
## Specificity 0.3333 0.5833
## Pos Pred Value 0.3333 0.0000
## Neg Pred Value 0.2222 0.7000
## Prevalence 0.6000 0.2000
## Detection Rate 0.1333 0.0000
## Detection Prevalence 0.4000 0.3333
## Balanced Accuracy 0.2778 0.2917
## Class: Adult literature
## Sensitivity 0.0000
## Specificity 0.6667
## Pos Pred Value 0.0000
## Neg Pred Value 0.7273
## Prevalence 0.2000
## Detection Rate 0.0000
## Detection Prevalence 0.2667
## Balanced Accuracy 0.3333
# --- 3. 1,000-fold randomization ---
# Permutation baseline: shuffle the true test labels 1,000 times and, for each
# gradation level, score the shuffled labels one-vs-all. The resulting metric
# distributions show what a label-agnostic classifier achieves by chance.
set.seed(2)
n_iter <- 1000
metric_names <- c("Accuracy", "Kappa", "Sensitivity", "Specificity",
"Precision", "Recall", "F1", "Prevalence",
"Detection Rate", "Detection Prevalence", "Balanced Accuracy")
# --- For Graded Readers ---
levels_readers <- levels(readers_testing$Level)
# Create a nested list to store metrics per gradation level
random_metrics_readers <- list()
for (lev in levels_readers) {
random_metrics_readers[[lev]] <- list()
for (metric in metric_names) {
# Preallocate one numeric vector of length n_iter per (level, metric) pair.
random_metrics_readers[[lev]][[metric]] <- numeric(n_iter)
}
}
# For each iteration, randomly shuffle predictions and compute one-vs-all confusion matrices
for (i in 1:n_iter) {
# Randomly shuffle class labels for the entire test set
random_labels <- sample(readers_testing$Level)
# For each gradation level, compute a binary confusion matrix
for (lev in levels_readers) {
# Create binary predictions: TRUE if the random label equals the current level, FALSE otherwise
pred_binary <- factor(random_labels == lev, levels = c(FALSE, TRUE))
actual_binary <- factor(readers_testing$Level == lev, levels = c(FALSE, TRUE))
# Compute confusion matrix for binary classification, specifying "TRUE" as the positive class
cm <- confusionMatrix(pred_binary, actual_binary, positive = "TRUE")
# Store overall metrics from cm$overall and per-class metrics from cm$byClass
random_metrics_readers[[lev]][["Accuracy"]][i] <- cm$overall["Accuracy"]
random_metrics_readers[[lev]][["Kappa"]][i] <- cm$overall["Kappa"]
random_metrics_readers[[lev]][["Sensitivity"]][i] <- cm$byClass["Sensitivity"]
random_metrics_readers[[lev]][["Specificity"]][i] <- cm$byClass["Specificity"]
random_metrics_readers[[lev]][["Precision"]][i] <- cm$byClass["Precision"]
random_metrics_readers[[lev]][["Recall"]][i] <- cm$byClass["Recall"]
random_metrics_readers[[lev]][["F1"]][i] <- cm$byClass["F1"]
random_metrics_readers[[lev]][["Prevalence"]][i] <- cm$byClass["Prevalence"]
random_metrics_readers[[lev]][["Detection Rate"]][i] <- cm$byClass["Detection Rate"]
random_metrics_readers[[lev]][["Detection Prevalence"]][i] <- cm$byClass["Detection Prevalence"]
random_metrics_readers[[lev]][["Balanced Accuracy"]][i] <- cm$byClass["Balanced Accuracy"]
}
}
# Compute summary statistics (mean and sd) for each metric per gradation
summary_random_readers <- lapply(random_metrics_readers, function(class_metrics) {
sapply(class_metrics, function(x) c(mean = mean(x), sd = sd(x)))
})
# --- For Literary Works ---
# Same randomization baseline as above, applied to the literary works split.
# Note: continues the RNG stream from set.seed(2) above, so this block is only
# reproducible when run immediately after the readers randomization.
levels_literature <- levels(literature_testing$Level)
random_metrics_literature <- list()
for (lev in levels_literature) {
random_metrics_literature[[lev]] <- list()
for (metric in metric_names) {
# Preallocate one numeric vector of length n_iter per (level, metric) pair.
random_metrics_literature[[lev]][[metric]] <- numeric(n_iter)
}
}
for (i in 1:n_iter) {
# Shuffle the true labels, then score them one-vs-all per level.
random_labels <- sample(literature_testing$Level)
for (lev in levels_literature) {
pred_binary <- factor(random_labels == lev, levels = c(FALSE, TRUE))
actual_binary <- factor(literature_testing$Level == lev, levels = c(FALSE, TRUE))
cm <- confusionMatrix(pred_binary, actual_binary, positive = "TRUE")
random_metrics_literature[[lev]][["Accuracy"]][i] <- cm$overall["Accuracy"]
random_metrics_literature[[lev]][["Kappa"]][i] <- cm$overall["Kappa"]
random_metrics_literature[[lev]][["Sensitivity"]][i] <- cm$byClass["Sensitivity"]
random_metrics_literature[[lev]][["Specificity"]][i] <- cm$byClass["Specificity"]
random_metrics_literature[[lev]][["Precision"]][i] <- cm$byClass["Precision"]
random_metrics_literature[[lev]][["Recall"]][i] <- cm$byClass["Recall"]
random_metrics_literature[[lev]][["F1"]][i] <- cm$byClass["F1"]
random_metrics_literature[[lev]][["Prevalence"]][i] <- cm$byClass["Prevalence"]
random_metrics_literature[[lev]][["Detection Rate"]][i] <- cm$byClass["Detection Rate"]
random_metrics_literature[[lev]][["Detection Prevalence"]][i] <- cm$byClass["Detection Prevalence"]
random_metrics_literature[[lev]][["Balanced Accuracy"]][i] <- cm$byClass["Balanced Accuracy"]
}
}
# Mean and sd of each chance-level metric, per level.
summary_random_literature <- lapply(random_metrics_literature, function(class_metrics) {
sapply(class_metrics, function(x) c(mean = mean(x), sd = sd(x)))
})
# --- Print the results ---
# Report the mean and SD of each chance-level metric, per gradation level.
cat("\n**1,000-Fold Randomization Results (Graded Readers)**\n")
##
## **1,000-Fold Randomization Results (Graded Readers)**
for (lev in levels_readers) {
cat("\n--", lev, "--\n")
print(summary_random_readers[[lev]])
}
##
## -- Beginner --
## Accuracy Kappa Sensitivity Specificity Precision Recall F1
## mean 0.5417647 -0.003257576 0.3508333 0.64590909 0.3508333 0.3508333 NaN
## sd 0.1107922 0.242567703 0.1569556 0.08561213 0.1569556 0.1569556 NA
## Prevalence Detection Rate Detection Prevalence Balanced Accuracy
## mean 0.3529412 0.12382353 0.3529412 0.4983712
## sd 0.0000000 0.05539608 0.0000000 0.1212839
##
## -- Intermediate --
## Accuracy Kappa Sensitivity Specificity Precision Recall F1
## mean 0.5416471 -0.003515152 0.3506667 0.64581818 0.3506667 0.3506667 NaN
## sd 0.1138274 0.249213047 0.1612555 0.08795755 0.1612555 0.1612555 NA
## Prevalence Detection Rate Detection Prevalence Balanced Accuracy
## mean 0.3529412 0.12376471 0.3529412 0.4982424
## sd 0.0000000 0.05691371 0.0000000 0.1246065
##
## -- Advanced --
## Accuracy Kappa Sensitivity Specificity Precision Recall F1
## mean 0.5828235 -0.0047000 0.2908000 0.70450000 0.2908000 0.2908000 NaN
## sd 0.1019284 0.2454777 0.1732783 0.07219931 0.1732783 0.1732783 NA
## Prevalence Detection Rate Detection Prevalence Balanced Accuracy
## mean 0.2941176 0.08552941 0.2941176 0.4976500
## sd 0.0000000 0.05096422 0.0000000 0.1227388
# Same report for the literary works split.
cat("\n**1,000-Fold Randomization Results (Literary Works)**\n")
##
## **1,000-Fold Randomization Results (Literary Works)**
for (lev in levels_literature) {
cat("\n--", lev, "--\n")
print(summary_random_literature[[lev]])
}
##
## -- Children's literature --
## Accuracy Kappa Sensitivity Specificity Precision Recall F1
## mean 0.5164000 -0.0075000 0.5970000 0.3955000 0.5970000 0.5970000 0.5970000
## sd 0.1313751 0.2736981 0.1094792 0.1642188 0.1094792 0.1094792 0.1094792
## Prevalence Detection Rate Detection Prevalence Balanced Accuracy
## mean 0.6 0.35820000 0.6 0.496250
## sd 0.0 0.06568754 0.0 0.136849
##
## -- Young adult fiction --
## Accuracy Kappa Sensitivity Specificity Precision Recall F1
## mean 0.67840000 -0.0050000 0.1960000 0.79900000 0.1960000 0.1960000 NaN
## sd 0.08606692 0.2689591 0.2151673 0.05379182 0.2151673 0.2151673 NA
## Prevalence Detection Rate Detection Prevalence Balanced Accuracy
## mean 0.2 0.03920000 0.2 0.4975000
## sd 0.0 0.04303346 0.0 0.1344796
##
## -- Adult literature --
## Accuracy Kappa Sensitivity Specificity Precision Recall F1
## mean 0.68306667 0.009583333 0.2076667 0.80191667 0.2076667 0.2076667 NaN
## sd 0.08654271 0.270445971 0.2163568 0.05408919 0.2163568 0.2163568 NA
## Prevalence Detection Rate Detection Prevalence Balanced Accuracy
## mean 0.2 0.04153333 0.2 0.5047917
## sd 0.0 0.04327136 0.0 0.1352230
PERMUTATION TESTS
We will run the permutation tests on the following variables:
# 1) AVERAGE DEPTH OF THE CONTEXT TREES
# === Step 1. Calculate the difference in sample means: ===
# Per-level mean of the average context-tree depth for graded readers (GR) and
# literary works (LW), plus the overall mean for the reference corpus (RC).
GR_mean_ctxt_depth_A1A2 <- mean(readers_corpus$Ctxt_Avg_Depth[readers_corpus$Level=="Beginner"])
GR_mean_ctxt_depth_B1 <- mean(readers_corpus$Ctxt_Avg_Depth[readers_corpus$Level=="Intermediate"])
GR_mean_ctxt_depth_B2 <- mean(readers_corpus$Ctxt_Avg_Depth[readers_corpus$Level=="Advanced"])
LW_mean_ctxt_depth_A1A2 <- mean(literature_corpus$Ctxt_Avg_Depth[literature_corpus$Level=="Children's literature"])
LW_mean_ctxt_depth_B1 <- mean(literature_corpus$Ctxt_Avg_Depth[literature_corpus$Level=="Young adult fiction"])
LW_mean_ctxt_depth_B2 <- mean(literature_corpus$Ctxt_Avg_Depth[literature_corpus$Level=="Adult literature"])
RC_mean_ctxt_depth <- mean(reference_corpus$Ctxt_Avg_Depth)
# === Step 2. Calculate the absolute difference in means ===
# Observed test statistics: |mean difference| for each pairwise comparison.
GR_LW_mean_ctxt_depth_stat_A1A2 <- abs(GR_mean_ctxt_depth_A1A2 - LW_mean_ctxt_depth_A1A2)
GR_LW_mean_ctxt_depth_stat_B1 <- abs(GR_mean_ctxt_depth_B1 - LW_mean_ctxt_depth_B1)
GR_LW_mean_ctxt_depth_stat_B2 <- abs(GR_mean_ctxt_depth_B2 - LW_mean_ctxt_depth_B2)
GR_RC_mean_ctxt_depth_stat_A1A2 <- abs(GR_mean_ctxt_depth_A1A2 - RC_mean_ctxt_depth)
GR_RC_mean_ctxt_depth_stat_B1 <- abs(GR_mean_ctxt_depth_B1 - RC_mean_ctxt_depth)
GR_RC_mean_ctxt_depth_stat_B2 <- abs(GR_mean_ctxt_depth_B2 - RC_mean_ctxt_depth)
LW_RC_mean_ctxt_depth_stat_A1A2 <- abs(LW_mean_ctxt_depth_A1A2 - RC_mean_ctxt_depth)
LW_RC_mean_ctxt_depth_stat_B1 <- abs(LW_mean_ctxt_depth_B1 - RC_mean_ctxt_depth)
LW_RC_mean_ctxt_depth_stat_B2 <- abs(LW_mean_ctxt_depth_B2 - RC_mean_ctxt_depth)
# === Step 3. Perform the permutation test ===
# AVERAGE DEPTH OF THE CONTEXT TREES
# Number of observations to sample
GR_n <- length(readers_corpus$Level)
LW_n <- length(literature_corpus$Level)
RC_n <- length(reference_corpus$group)
# Number of permutation samples to take
P <- 10000
# Variable we will resample from
GR_ctxt_depth_variable <- readers_corpus$Ctxt_Avg_Depth
LW_ctxt_depth_variable <- literature_corpus$Ctxt_Avg_Depth
RC_ctxt_depth_variable <- reference_corpus$Ctxt_Avg_Depth
# Initialize a matrix to store the permutation data. Each column is a permutation sample of data.
# sample(..., replace=FALSE) of the full vector is a reshuffle, so each column
# is one permutation of the observed values.
GR_ctxt_depth_PermSamples <- matrix(0, nrow=GR_n, ncol=P)
LW_ctxt_depth_PermSamples <- matrix(0, nrow=LW_n, ncol=P)
RC_ctxt_depth_PermSamples <- matrix(0, nrow=RC_n, ncol=P)
# NOTE(review): the same seed (1979) is reset before each corpus, so the three
# corpora share the same permutation-index stream -- presumably intentional for
# reproducibility; confirm.
set.seed(1979)
for(i in 1:P){
GR_ctxt_depth_PermSamples[, i] <- sample(GR_ctxt_depth_variable, size=GR_n, replace=FALSE)
}
set.seed(1979)
for(i in 1:P){
LW_ctxt_depth_PermSamples[, i] <- sample(LW_ctxt_depth_variable, size=LW_n, replace=FALSE)
}
set.seed(1979)
for(i in 1:P){
RC_ctxt_depth_PermSamples[, i] <- sample(RC_ctxt_depth_variable, size=RC_n, replace=FALSE)
}
# Initialize vectors to store the test stats (one per pairwise comparison and
# proficiency band).
Perm.test.stat_GR_LW_ctxt_depth_A1A2 <- rep(0, P)
Perm.test.stat_GR_LW_ctxt_depth_B1 <- rep(0, P)
Perm.test.stat_GR_LW_ctxt_depth_B2 <- rep(0, P)
Perm.test.stat_GR_RC_ctxt_depth_A1A2 <- rep(0, P)
Perm.test.stat_GR_RC_ctxt_depth_B1 <- rep(0, P)
Perm.test.stat_GR_RC_ctxt_depth_B2 <- rep(0, P)
Perm.test.stat_LW_RC_ctxt_depth_A1A2 <- rep(0, P)
Perm.test.stat_LW_RC_ctxt_depth_B1 <- rep(0, P)
Perm.test.stat_LW_RC_ctxt_depth_B2 <- rep(0, P)
# Loop through, and calculate test stats: for each permuted column, the
# absolute difference between the two (sub)group means under random labeling.
for(i in 1:P){
  Perm.test.stat_GR_LW_ctxt_depth_A1A2[i] <- abs(mean(GR_ctxt_depth_PermSamples[
    readers_corpus$Level=="Beginner", i]) - mean(LW_ctxt_depth_PermSamples[
    literature_corpus$Level=="Children's literature", i]))
}
for(i in 1:P){
  Perm.test.stat_GR_LW_ctxt_depth_B1[i] <- abs(mean(GR_ctxt_depth_PermSamples[
    readers_corpus$Level=="Intermediate", i]) - mean(LW_ctxt_depth_PermSamples[
    literature_corpus$Level=="Young adult fiction", i]))
}
for(i in 1:P){
  Perm.test.stat_GR_LW_ctxt_depth_B2[i] <- abs(mean(GR_ctxt_depth_PermSamples[
    readers_corpus$Level=="Advanced", i]) - mean(LW_ctxt_depth_PermSamples[
    literature_corpus$Level=="Adult literature", i]))
}
for(i in 1:P){
  Perm.test.stat_GR_RC_ctxt_depth_A1A2[i] <- abs(mean(GR_ctxt_depth_PermSamples[
    readers_corpus$Level=="Beginner", i]) - mean(RC_ctxt_depth_PermSamples[
    reference_corpus$group==1, i]))
}
for(i in 1:P){
  Perm.test.stat_GR_RC_ctxt_depth_B1[i] <- abs(mean(GR_ctxt_depth_PermSamples[
    readers_corpus$Level=="Intermediate", i]) - mean(RC_ctxt_depth_PermSamples[
    reference_corpus$group==1, i]))
}
for(i in 1:P){
  Perm.test.stat_GR_RC_ctxt_depth_B2[i] <- abs(mean(GR_ctxt_depth_PermSamples[
    readers_corpus$Level=="Advanced", i]) - mean(RC_ctxt_depth_PermSamples[
    reference_corpus$group==1, i]))
}
# FIX: the three LW-vs-RC statistic vectors were initialized above but never
# computed, so the LW-RC p-values further below were taken over all-zero
# vectors and came out as 0 regardless of the data. Fill them here, using the
# same RC subset (group == 1) as the GR-RC loops -- presumably this selects
# the whole reference corpus; TODO confirm.
for(i in 1:P){
  Perm.test.stat_LW_RC_ctxt_depth_A1A2[i] <- abs(mean(LW_ctxt_depth_PermSamples[
    literature_corpus$Level=="Children's literature", i]) - mean(RC_ctxt_depth_PermSamples[
    reference_corpus$group==1, i]))
}
for(i in 1:P){
  Perm.test.stat_LW_RC_ctxt_depth_B1[i] <- abs(mean(LW_ctxt_depth_PermSamples[
    literature_corpus$Level=="Young adult fiction", i]) - mean(RC_ctxt_depth_PermSamples[
    reference_corpus$group==1, i]))
}
for(i in 1:P){
  Perm.test.stat_LW_RC_ctxt_depth_B2[i] <- abs(mean(LW_ctxt_depth_PermSamples[
    literature_corpus$Level=="Adult literature", i]) - mean(RC_ctxt_depth_PermSamples[
    reference_corpus$group==1, i]))
}
# === Calculate the permutation p-value ===
# p-value = proportion of permuted statistics at least as extreme as observed.
# === GRs vs. LWs ===
# If there is no difference in the mean average depth of the context trees of beginner GRs and LWs, we would observe a test statistic of 1.091 or more by chance roughly 0% of the time.
mean(Perm.test.stat_GR_LW_ctxt_depth_A1A2 >= GR_LW_mean_ctxt_depth_stat_A1A2)
## [1] 0
# If there is no difference in the mean average depth of the context trees of intermediate GRs and LWs, we would observe a test statistic of 0.028 or more by chance roughly 98.14% of the time.
mean(Perm.test.stat_GR_LW_ctxt_depth_B1 >= GR_LW_mean_ctxt_depth_stat_B1)
## [1] 0.9814
# If there is no difference in the mean average depth of the context trees of advanced GRs and LWs, we would observe a test statistic of 0.262 or more by chance roughly 74.50% of the time.
mean(Perm.test.stat_GR_LW_ctxt_depth_B2 >= GR_LW_mean_ctxt_depth_stat_B2)
## [1] 0.745
# === GRs vs. RC ===
# If there is no difference in the mean average depth of the context trees of beginner GRs and the RC, we would observe a test statistic of 2.999 or more by chance roughly 0% of the time.
mean(Perm.test.stat_GR_RC_ctxt_depth_A1A2 >= GR_RC_mean_ctxt_depth_stat_A1A2)
## [1] 0
# If there is no difference in the mean average depth of the context trees of intermediate GRs and the RC, we would observe a test statistic of 2.158 or more by chance roughly 77.81% of the time.
mean(Perm.test.stat_GR_RC_ctxt_depth_B1 >= GR_RC_mean_ctxt_depth_stat_B1)
## [1] 0.7781
# If there is no difference in the mean average depth of the context trees of advanced GRs and the RC, we would observe a test statistic of 1.559 or more by chance roughly 99.97% of the time.
mean(Perm.test.stat_GR_RC_ctxt_depth_B2 >= GR_RC_mean_ctxt_depth_stat_B2)
## [1] 0.9997
# === LW vs. RC ===
# NOTE(review): these three p-values are only meaningful if the
# Perm.test.stat_LW_RC_ctxt_depth_* vectors were actually filled by permutation
# loops; if they still hold their rep(0, P) initial values, each p-value is
# trivially 0 -- confirm before interpreting.
# If there is no difference in the mean average depth of the context trees of children's LW and the RC, we would observe a test statistic of 1.908 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_ctxt_depth_A1A2 >= LW_RC_mean_ctxt_depth_stat_A1A2)
## [1] 0
# If there is no difference in the mean average depth of the context trees of young adult LW and the RC, we would observe a test statistic of 2.186 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_ctxt_depth_B1 >= LW_RC_mean_ctxt_depth_stat_B1)
## [1] 0
# If there is no difference in the mean average depth of the context trees of adult LW and the RC, we would observe a test statistic of 1.296 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_ctxt_depth_B2 >= LW_RC_mean_ctxt_depth_stat_B2)
## [1] 0
# --------------------
# 2) RELATIVE FREQUENCY OF THE SUBJUNCTIVE MOOD
# === Step 1. Calculate the difference in sample means: ===
# Per-level mean subjunctive frequency for GR and LW, overall mean for RC.
GR_mean_sub_A1A2 <- mean(readers_corpus$Freq_Sub[readers_corpus$Level=="Beginner"])
GR_mean_sub_B1 <- mean(readers_corpus$Freq_Sub[readers_corpus$Level=="Intermediate"])
GR_mean_sub_B2 <- mean(readers_corpus$Freq_Sub[readers_corpus$Level=="Advanced"])
LW_mean_sub_A1A2 <- mean(literature_corpus$Freq_Sub[literature_corpus$Level=="Children's literature"])
LW_mean_sub_B1 <- mean(literature_corpus$Freq_Sub[literature_corpus$Level=="Young adult fiction"])
LW_mean_sub_B2 <- mean(literature_corpus$Freq_Sub[literature_corpus$Level=="Adult literature"])
RC_mean_sub <- mean(reference_corpus$Freq_Sub)
# === Step 2. Calculate the absolute difference in means ===
# Observed test statistics: |mean difference| for each pairwise comparison.
GR_LW_mean_sub_stat_A1A2 <- abs(GR_mean_sub_A1A2 - LW_mean_sub_A1A2)
GR_LW_mean_sub_stat_B1 <- abs(GR_mean_sub_B1 - LW_mean_sub_B1)
GR_LW_mean_sub_stat_B2 <- abs(GR_mean_sub_B2 - LW_mean_sub_B2)
GR_RC_mean_sub_stat_A1A2 <- abs(GR_mean_sub_A1A2 - RC_mean_sub)
GR_RC_mean_sub_stat_B1 <- abs(GR_mean_sub_B1 - RC_mean_sub)
GR_RC_mean_sub_stat_B2 <- abs(GR_mean_sub_B2 - RC_mean_sub)
LW_RC_mean_sub_stat_A1A2 <- abs(LW_mean_sub_A1A2 - RC_mean_sub)
LW_RC_mean_sub_stat_B1 <- abs(LW_mean_sub_B1 - RC_mean_sub)
LW_RC_mean_sub_stat_B2 <- abs(LW_mean_sub_B2 - RC_mean_sub)
# === Step 3. Perform the permutation test ===
# GR_n, LW_n, RC_n and P are reused from the context-depth section above.
# Variable we will resample from
GR_sub_variable <- readers_corpus$Freq_Sub
LW_sub_variable <- literature_corpus$Freq_Sub
RC_sub_variable <- reference_corpus$Freq_Sub
# Initialize a matrix to store the permutation data. Each column is a permutation sample of data.
GR_sub_PermSamples <- matrix(0, nrow=GR_n, ncol=P)
LW_sub_PermSamples <- matrix(0, nrow=LW_n, ncol=P)
RC_sub_PermSamples <- matrix(0, nrow=RC_n, ncol=P)
# The same seed (1979) is reset before each corpus, mirroring the
# context-depth section above.
set.seed(1979)
for(i in 1:P){
GR_sub_PermSamples[, i] <- sample(GR_sub_variable, size=GR_n, replace=FALSE)
}
set.seed(1979)
for(i in 1:P){
LW_sub_PermSamples[, i] <- sample(LW_sub_variable, size=LW_n, replace=FALSE)
}
set.seed(1979)
for(i in 1:P){
RC_sub_PermSamples[, i] <- sample(RC_sub_variable, size=RC_n, replace=FALSE)
}
# Initialize vectors to store the test stats (one per pairwise comparison and
# proficiency band).
Perm.test.stat_GR_LW_sub_A1A2 <- rep(0, P)
Perm.test.stat_GR_LW_sub_B1 <- rep(0, P)
Perm.test.stat_GR_LW_sub_B2 <- rep(0, P)
Perm.test.stat_GR_RC_sub_A1A2 <- rep(0, P)
Perm.test.stat_GR_RC_sub_B1 <- rep(0, P)
Perm.test.stat_GR_RC_sub_B2 <- rep(0, P)
Perm.test.stat_LW_RC_sub_A1A2 <- rep(0, P)
Perm.test.stat_LW_RC_sub_B1 <- rep(0, P)
Perm.test.stat_LW_RC_sub_B2 <- rep(0, P)
# Loop through, and calculate test stats: for each permuted column, the
# absolute difference between the two (sub)group means under random labeling.
for(i in 1:P){
  Perm.test.stat_GR_LW_sub_A1A2[i] <- abs(mean(GR_sub_PermSamples[
    readers_corpus$Level=="Beginner", i]) - mean(LW_sub_PermSamples[
    literature_corpus$Level=="Children's literature", i]))
}
for(i in 1:P){
  Perm.test.stat_GR_LW_sub_B1[i] <- abs(mean(GR_sub_PermSamples[
    readers_corpus$Level=="Intermediate", i]) - mean(LW_sub_PermSamples[
    literature_corpus$Level=="Young adult fiction", i]))
}
for(i in 1:P){
  Perm.test.stat_GR_LW_sub_B2[i] <- abs(mean(GR_sub_PermSamples[
    readers_corpus$Level=="Advanced", i]) - mean(LW_sub_PermSamples[
    literature_corpus$Level=="Adult literature", i]))
}
for(i in 1:P){
  Perm.test.stat_GR_RC_sub_A1A2[i] <- abs(mean(GR_sub_PermSamples[
    readers_corpus$Level=="Beginner", i]) - mean(RC_sub_PermSamples[
    reference_corpus$group==1, i]))
}
for(i in 1:P){
  Perm.test.stat_GR_RC_sub_B1[i] <- abs(mean(GR_sub_PermSamples[
    readers_corpus$Level=="Intermediate", i]) - mean(RC_sub_PermSamples[
    reference_corpus$group==1, i]))
}
for(i in 1:P){
  Perm.test.stat_GR_RC_sub_B2[i] <- abs(mean(GR_sub_PermSamples[
    readers_corpus$Level=="Advanced", i]) - mean(RC_sub_PermSamples[
    reference_corpus$group==1, i]))
}
# FIX: the three LW-vs-RC statistic vectors were initialized above but never
# computed, so the LW-RC p-values further below were taken over all-zero
# vectors and came out as 0 regardless of the data. Fill them here, using the
# same RC subset (group == 1) as the GR-RC loops -- presumably this selects
# the whole reference corpus; TODO confirm.
for(i in 1:P){
  Perm.test.stat_LW_RC_sub_A1A2[i] <- abs(mean(LW_sub_PermSamples[
    literature_corpus$Level=="Children's literature", i]) - mean(RC_sub_PermSamples[
    reference_corpus$group==1, i]))
}
for(i in 1:P){
  Perm.test.stat_LW_RC_sub_B1[i] <- abs(mean(LW_sub_PermSamples[
    literature_corpus$Level=="Young adult fiction", i]) - mean(RC_sub_PermSamples[
    reference_corpus$group==1, i]))
}
for(i in 1:P){
  Perm.test.stat_LW_RC_sub_B2[i] <- abs(mean(LW_sub_PermSamples[
    literature_corpus$Level=="Adult literature", i]) - mean(RC_sub_PermSamples[
    reference_corpus$group==1, i]))
}
# === Calculate the permutation p-value ===
# p-value = proportion of permuted statistics at least as extreme as observed.
# === GRs vs. LWs ===
# If there is no difference in the mean average subjunctive frequencies of beginner GRs and LWs, we would observe a test statistic of 0.0039 or more by chance roughly 0.57% of the time.
mean(Perm.test.stat_GR_LW_sub_A1A2 >= GR_LW_mean_sub_stat_A1A2)
## [1] 0.0057
# If there is no difference in the mean average subjunctive frequencies of intermediate GRs and LWs, we would observe a test statistic of 0.002 or more by chance roughly 53.95% of the time.
mean(Perm.test.stat_GR_LW_sub_B1 >= GR_LW_mean_sub_stat_B1)
## [1] 0.5395
# If there is no difference in the mean average subjunctive frequencies of advanced GRs and LWs, we would observe a test statistic of 0.001 or more by chance roughly 77.23% of the time.
mean(Perm.test.stat_GR_LW_sub_B2 >= GR_LW_mean_sub_stat_B2)
## [1] 0.7723
# === GRs vs. RC ===
# If there is no difference in the mean average subjunctive frequencies of beginner GRs and RC, we would observe a test statistic of 0.0041 or more by chance roughly 0% of the time.
mean(Perm.test.stat_GR_RC_sub_A1A2 >= GR_RC_mean_sub_stat_A1A2)
## [1] 0
# If there is no difference in the mean average subjunctive frequencies of intermediate GRs and RC, we would observe a test statistic of 0.0014 or more by chance roughly 61.44% of the time.
mean(Perm.test.stat_GR_RC_sub_B1 >= GR_RC_mean_sub_stat_B1)
## [1] 0.6144
# If there is no difference in the mean average subjunctive frequencies of advanced GRs and RC, we would observe a test statistic of 0.0012 or more by chance roughly 68.73% of the time.
mean(Perm.test.stat_GR_RC_sub_B2 >= GR_RC_mean_sub_stat_B2)
## [1] 0.6873
# === LW vs. RC ===
# NOTE(review): these three p-values are only meaningful if the
# Perm.test.stat_LW_RC_sub_* vectors were actually filled by permutation loops;
# if they still hold their rep(0, P) initial values, each p-value is trivially
# 0 -- confirm before interpreting.
# If there is no difference in the mean average subjunctive frequencies of beginner LW and RC, we would observe a test statistic of 0.00024 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_sub_A1A2 >= LW_RC_mean_sub_stat_A1A2)
## [1] 0
# If there is no difference in the mean average subjunctive frequencies of intermediate LW and RC, we would observe a test statistic of 0.0006 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_sub_B1 >= LW_RC_mean_sub_stat_B1)
## [1] 0
# If there is no difference in the mean average subjunctive frequencies of advanced LW and RC, we would observe a test statistic of 0.0023 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_sub_B2 >= LW_RC_mean_sub_stat_B2)
## [1] 0
# --------------------
# 3) RELATIVE FREQUENCY OF A1-A2 WORDS
# === Step 1. Calculate the difference in sample means: ===
# Per-level mean relative frequency of A1-A2 vocabulary in each corpus.
GR_mean_A1A2_A1A2 <- with(readers_corpus, mean(Freq_A1A2[Level == "Beginner"]))
GR_mean_A1A2_B1 <- with(readers_corpus, mean(Freq_A1A2[Level == "Intermediate"]))
GR_mean_A1A2_B2 <- with(readers_corpus, mean(Freq_A1A2[Level == "Advanced"]))
LW_mean_A1A2_A1A2 <- with(literature_corpus, mean(Freq_A1A2[Level == "Children's literature"]))
LW_mean_A1A2_B1 <- with(literature_corpus, mean(Freq_A1A2[Level == "Young adult fiction"]))
LW_mean_A1A2_B2 <- with(literature_corpus, mean(Freq_A1A2[Level == "Adult literature"]))
RC_mean_A1A2 <- mean(reference_corpus$Freq_A1A2)
# === Step 2. Calculate the absolute difference in means ===
# Observed test statistics: absolute gaps between the group means above.
GR_LW_mean_A1A2_stat_A1A2 <- abs(GR_mean_A1A2_A1A2 - LW_mean_A1A2_A1A2)
GR_LW_mean_A1A2_stat_B1 <- abs(GR_mean_A1A2_B1 - LW_mean_A1A2_B1)
GR_LW_mean_A1A2_stat_B2 <- abs(GR_mean_A1A2_B2 - LW_mean_A1A2_B2)
GR_RC_mean_A1A2_stat_A1A2 <- abs(GR_mean_A1A2_A1A2 - RC_mean_A1A2)
GR_RC_mean_A1A2_stat_B1 <- abs(GR_mean_A1A2_B1 - RC_mean_A1A2)
GR_RC_mean_A1A2_stat_B2 <- abs(GR_mean_A1A2_B2 - RC_mean_A1A2)
LW_RC_mean_A1A2_stat_A1A2 <- abs(LW_mean_A1A2_A1A2 - RC_mean_A1A2)
LW_RC_mean_A1A2_stat_B1 <- abs(LW_mean_A1A2_B1 - RC_mean_A1A2)
LW_RC_mean_A1A2_stat_B2 <- abs(LW_mean_A1A2_B2 - RC_mean_A1A2)
# === Step 3. Perform the permutation test ===
# Variables we will resample from
GR_A1A2_variable <- readers_corpus$Freq_A1A2
LW_A1A2_variable <- literature_corpus$Freq_A1A2
RC_A1A2_variable <- reference_corpus$Freq_A1A2
# Build the permutation samples: each matrix column is one full shuffle of the
# corresponding variable. replicate() issues the same sequence of sample()
# calls as a column-by-column loop, so with the same seed the shuffles (and
# hence the published numbers) are reproduced exactly.
set.seed(1979)
GR_A1A2_PermSamples <- replicate(P, sample(GR_A1A2_variable, size = GR_n, replace = FALSE))
set.seed(1979)
LW_A1A2_PermSamples <- replicate(P, sample(LW_A1A2_variable, size = LW_n, replace = FALSE))
set.seed(1979)
RC_A1A2_PermSamples <- replicate(P, sample(RC_A1A2_variable, size = RC_n, replace = FALSE))
# Initialize vectors to store the test stats
Perm.test.stat_GR_LW_A1A2_A1A2 <- rep(0, P)
Perm.test.stat_GR_LW_A1A2_B1 <- rep(0, P)
Perm.test.stat_GR_LW_A1A2_B2 <- rep(0, P)
Perm.test.stat_GR_RC_A1A2_A1A2 <- rep(0, P)
Perm.test.stat_GR_RC_A1A2_B1 <- rep(0, P)
Perm.test.stat_GR_RC_A1A2_B2 <- rep(0, P)
Perm.test.stat_LW_RC_A1A2_A1A2 <- rep(0, P)
Perm.test.stat_LW_RC_A1A2_B1 <- rep(0, P)
Perm.test.stat_LW_RC_A1A2_B2 <- rep(0, P)
# Loop through, and calculate test stats: for every permutation (column),
# the absolute difference between the two subgroup means under the null.
for (i in seq_len(P)) {
  Perm.test.stat_GR_LW_A1A2_A1A2[i] <- abs(mean(GR_A1A2_PermSamples[
    readers_corpus$Level=="Beginner", i]) - mean(LW_A1A2_PermSamples[
    literature_corpus$Level=="Children's literature", i]))
}
for (i in seq_len(P)) {
  Perm.test.stat_GR_LW_A1A2_B1[i] <- abs(mean(GR_A1A2_PermSamples[
    readers_corpus$Level=="Intermediate", i]) - mean(LW_A1A2_PermSamples[
    literature_corpus$Level=="Young adult fiction", i]))
}
for (i in seq_len(P)) {
  Perm.test.stat_GR_LW_A1A2_B2[i] <- abs(mean(GR_A1A2_PermSamples[
    readers_corpus$Level=="Advanced", i]) - mean(LW_A1A2_PermSamples[
    literature_corpus$Level=="Adult literature", i]))
}
for (i in seq_len(P)) {
  Perm.test.stat_GR_RC_A1A2_A1A2[i] <- abs(mean(GR_A1A2_PermSamples[
    readers_corpus$Level=="Beginner", i]) - mean(RC_A1A2_PermSamples[
    reference_corpus$group==1, i]))
}
for (i in seq_len(P)) {
  Perm.test.stat_GR_RC_A1A2_B1[i] <- abs(mean(GR_A1A2_PermSamples[
    readers_corpus$Level=="Intermediate", i]) - mean(RC_A1A2_PermSamples[
    reference_corpus$group==1, i]))
}
for (i in seq_len(P)) {
  Perm.test.stat_GR_RC_A1A2_B2[i] <- abs(mean(GR_A1A2_PermSamples[
    readers_corpus$Level=="Advanced", i]) - mean(RC_A1A2_PermSamples[
    reference_corpus$group==1, i]))
}
# BUG FIX: the three LW-vs-RC loops were missing, so the
# Perm.test.stat_LW_RC_A1A2_* vectors kept their rep(0, P) initial values and
# the LW-vs-RC p-values computed below were trivially 0 instead of being the
# result of an actual permutation test. The loops mirror the GR-vs-RC pattern.
for (i in seq_len(P)) {
  Perm.test.stat_LW_RC_A1A2_A1A2[i] <- abs(mean(LW_A1A2_PermSamples[
    literature_corpus$Level=="Children's literature", i]) - mean(RC_A1A2_PermSamples[
    reference_corpus$group==1, i]))
}
for (i in seq_len(P)) {
  Perm.test.stat_LW_RC_A1A2_B1[i] <- abs(mean(LW_A1A2_PermSamples[
    literature_corpus$Level=="Young adult fiction", i]) - mean(RC_A1A2_PermSamples[
    reference_corpus$group==1, i]))
}
for (i in seq_len(P)) {
  Perm.test.stat_LW_RC_A1A2_B2[i] <- abs(mean(LW_A1A2_PermSamples[
    literature_corpus$Level=="Adult literature", i]) - mean(RC_A1A2_PermSamples[
    reference_corpus$group==1, i]))
}
# === Calculate the permutation p-value ===
# Each p-value is the proportion of permutation test statistics that are at
# least as extreme as the observed absolute difference in group means.
# === GRs vs. LWs ===
# If there is no difference in the mean average A1-A2 word frequencies of beginner GRs and LWs, we would observe a test statistic of 0.033 or more by chance roughly 0.15% of the time.
sum(Perm.test.stat_GR_LW_A1A2_A1A2 >= GR_LW_mean_A1A2_stat_A1A2) / length(Perm.test.stat_GR_LW_A1A2_A1A2)
## [1] 0.0015
# If there is no difference in the mean average A1-A2 word frequencies of intermediate GRs and LWs, we would observe a test statistic of 0.014 or more by chance roughly 59.33% of the time.
sum(Perm.test.stat_GR_LW_A1A2_B1 >= GR_LW_mean_A1A2_stat_B1) / length(Perm.test.stat_GR_LW_A1A2_B1)
## [1] 0.5933
# If there is no difference in the mean average A1-A2 word frequencies of advanced GRs and LWs, we would observe a test statistic of 0.013 or more by chance roughly 62.89% of the time.
sum(Perm.test.stat_GR_LW_A1A2_B2 >= GR_LW_mean_A1A2_stat_B2) / length(Perm.test.stat_GR_LW_A1A2_B2)
## [1] 0.6289
# === GRs vs. RC ===
# If there is no difference in the mean average A1-A2 word frequencies of beginner GRs and the RC, we would observe a test statistic of 0.103 or more by chance roughly 0% of the time.
sum(Perm.test.stat_GR_RC_A1A2_A1A2 >= GR_RC_mean_A1A2_stat_A1A2) / length(Perm.test.stat_GR_RC_A1A2_A1A2)
## [1] 0
# If there is no difference in the mean average A1-A2 word frequencies of intermediate GRs and the RC, we would observe a test statistic of 0.079 or more by chance roughly 49.38% of the time.
sum(Perm.test.stat_GR_RC_A1A2_B1 >= GR_RC_mean_A1A2_stat_B1) / length(Perm.test.stat_GR_RC_A1A2_B1)
## [1] 0.4938
# If there is no difference in the mean average A1-A2 word frequencies of advanced GRs and the RC, we would observe a test statistic of 0.048 or more by chance roughly 100% of the time.
sum(Perm.test.stat_GR_RC_A1A2_B2 >= GR_RC_mean_A1A2_stat_B2) / length(Perm.test.stat_GR_RC_A1A2_B2)
## [1] 1
# === LW vs. RC ===
# NOTE(review): in the original script no loop ever filled the
# Perm.test.stat_LW_RC_A1A2_* vectors, so the three 0 p-values below were
# trivial (all-zero vectors compared against a positive observed statistic).
# Verify the LW-vs-RC permutation loops have been run before citing these.
# If there is no difference in the mean average A1-A2 word frequencies of children LWs and the RC, we would observe a test statistic of 0.070 or more by chance roughly 0% of the time.
sum(Perm.test.stat_LW_RC_A1A2_A1A2 >= LW_RC_mean_A1A2_stat_A1A2) / length(Perm.test.stat_LW_RC_A1A2_A1A2)
## [1] 0
# If there is no difference in the mean average A1-A2 word frequencies of young adult LWs and the RC, we would observe a test statistic of 0.065 or more by chance roughly 0% of the time.
sum(Perm.test.stat_LW_RC_A1A2_B1 >= LW_RC_mean_A1A2_stat_B1) / length(Perm.test.stat_LW_RC_A1A2_B1)
## [1] 0
# If there is no difference in the mean average A1-A2 word frequencies of adult LW and the RC, we would observe a test statistic of 0.035 or more by chance roughly 0% of the time.
sum(Perm.test.stat_LW_RC_A1A2_B2 >= LW_RC_mean_A1A2_stat_B2) / length(Perm.test.stat_LW_RC_A1A2_B2)
## [1] 0
# --------------------
# 4) TFIDF OF THE GRADED VOCABULARY'S CONTEXT
# === Step 1. Calculate the difference in sample means: ===
# Per-level mean TFIDF of the graded vocabulary's context in each corpus.
GR_mean_ctxt_TFIDF_A1A2 <- with(readers_corpus, mean(Ctxt_TFIDF[Level == "Beginner"]))
GR_mean_ctxt_TFIDF_B1 <- with(readers_corpus, mean(Ctxt_TFIDF[Level == "Intermediate"]))
GR_mean_ctxt_TFIDF_B2 <- with(readers_corpus, mean(Ctxt_TFIDF[Level == "Advanced"]))
LW_mean_ctxt_TFIDF_A1A2 <- with(literature_corpus, mean(Ctxt_TFIDF[Level == "Children's literature"]))
LW_mean_ctxt_TFIDF_B1 <- with(literature_corpus, mean(Ctxt_TFIDF[Level == "Young adult fiction"]))
LW_mean_ctxt_TFIDF_B2 <- with(literature_corpus, mean(Ctxt_TFIDF[Level == "Adult literature"]))
RC_mean_ctxt_TFIDF <- mean(reference_corpus$Ctxt_TFIDF)
# === Step 2. Calculate the absolute difference in means ===
# Observed test statistics: absolute gaps between the group means above.
GR_LW_mean_ctxt_TFIDF_stat_A1A2 <- abs(GR_mean_ctxt_TFIDF_A1A2 - LW_mean_ctxt_TFIDF_A1A2)
GR_LW_mean_ctxt_TFIDF_stat_B1 <- abs(GR_mean_ctxt_TFIDF_B1 - LW_mean_ctxt_TFIDF_B1)
GR_LW_mean_ctxt_TFIDF_stat_B2 <- abs(GR_mean_ctxt_TFIDF_B2 - LW_mean_ctxt_TFIDF_B2)
GR_RC_mean_ctxt_TFIDF_stat_A1A2 <- abs(GR_mean_ctxt_TFIDF_A1A2 - RC_mean_ctxt_TFIDF)
GR_RC_mean_ctxt_TFIDF_stat_B1 <- abs(GR_mean_ctxt_TFIDF_B1 - RC_mean_ctxt_TFIDF)
GR_RC_mean_ctxt_TFIDF_stat_B2 <- abs(GR_mean_ctxt_TFIDF_B2 - RC_mean_ctxt_TFIDF)
LW_RC_mean_ctxt_TFIDF_stat_A1A2 <- abs(LW_mean_ctxt_TFIDF_A1A2 - RC_mean_ctxt_TFIDF)
LW_RC_mean_ctxt_TFIDF_stat_B1 <- abs(LW_mean_ctxt_TFIDF_B1 - RC_mean_ctxt_TFIDF)
LW_RC_mean_ctxt_TFIDF_stat_B2 <- abs(LW_mean_ctxt_TFIDF_B2 - RC_mean_ctxt_TFIDF)
# === Step 3. Perform the permutation test ===
# Variables we will resample from
GR_ctxt_TFIDF_variable <- readers_corpus$Ctxt_TFIDF
LW_ctxt_TFIDF_variable <- literature_corpus$Ctxt_TFIDF
RC_ctxt_TFIDF_variable <- reference_corpus$Ctxt_TFIDF
# Build the permutation samples: each matrix column is one full shuffle of the
# corresponding variable. replicate() issues the same sequence of sample()
# calls as a column-by-column loop, so with the same seed the shuffles (and
# hence the published numbers) are reproduced exactly.
set.seed(1979)
GR_ctxt_TFIDF_PermSamples <- replicate(P, sample(GR_ctxt_TFIDF_variable, size = GR_n, replace = FALSE))
set.seed(1979)
LW_ctxt_TFIDF_PermSamples <- replicate(P, sample(LW_ctxt_TFIDF_variable, size = LW_n, replace = FALSE))
set.seed(1979)
RC_ctxt_TFIDF_PermSamples <- replicate(P, sample(RC_ctxt_TFIDF_variable, size = RC_n, replace = FALSE))
# Initialize vectors to store the test stats
Perm.test.stat_GR_LW_ctxt_TFIDF_A1A2 <- rep(0, P)
Perm.test.stat_GR_LW_ctxt_TFIDF_B1 <- rep(0, P)
Perm.test.stat_GR_LW_ctxt_TFIDF_B2 <- rep(0, P)
Perm.test.stat_GR_RC_ctxt_TFIDF_A1A2 <- rep(0, P)
Perm.test.stat_GR_RC_ctxt_TFIDF_B1 <- rep(0, P)
Perm.test.stat_GR_RC_ctxt_TFIDF_B2 <- rep(0, P)
Perm.test.stat_LW_RC_ctxt_TFIDF_A1A2 <- rep(0, P)
Perm.test.stat_LW_RC_ctxt_TFIDF_B1 <- rep(0, P)
Perm.test.stat_LW_RC_ctxt_TFIDF_B2 <- rep(0, P)
# Loop through, and calculate test stats: for every permutation (column),
# the absolute difference between the two subgroup means under the null.
for (i in seq_len(P)) {
  Perm.test.stat_GR_LW_ctxt_TFIDF_A1A2[i] <- abs(mean(GR_ctxt_TFIDF_PermSamples[
    readers_corpus$Level=="Beginner", i]) - mean(LW_ctxt_TFIDF_PermSamples[
    literature_corpus$Level=="Children's literature", i]))
}
for (i in seq_len(P)) {
  Perm.test.stat_GR_LW_ctxt_TFIDF_B1[i] <- abs(mean(GR_ctxt_TFIDF_PermSamples[
    readers_corpus$Level=="Intermediate", i]) - mean(LW_ctxt_TFIDF_PermSamples[
    literature_corpus$Level=="Young adult fiction", i]))
}
for (i in seq_len(P)) {
  Perm.test.stat_GR_LW_ctxt_TFIDF_B2[i] <- abs(mean(GR_ctxt_TFIDF_PermSamples[
    readers_corpus$Level=="Advanced", i]) - mean(LW_ctxt_TFIDF_PermSamples[
    literature_corpus$Level=="Adult literature", i]))
}
for (i in seq_len(P)) {
  Perm.test.stat_GR_RC_ctxt_TFIDF_A1A2[i] <- abs(mean(GR_ctxt_TFIDF_PermSamples[
    readers_corpus$Level=="Beginner", i]) - mean(RC_ctxt_TFIDF_PermSamples[
    reference_corpus$group==1, i]))
}
for (i in seq_len(P)) {
  Perm.test.stat_GR_RC_ctxt_TFIDF_B1[i] <- abs(mean(GR_ctxt_TFIDF_PermSamples[
    readers_corpus$Level=="Intermediate", i]) - mean(RC_ctxt_TFIDF_PermSamples[
    reference_corpus$group==1, i]))
}
for (i in seq_len(P)) {
  Perm.test.stat_GR_RC_ctxt_TFIDF_B2[i] <- abs(mean(GR_ctxt_TFIDF_PermSamples[
    readers_corpus$Level=="Advanced", i]) - mean(RC_ctxt_TFIDF_PermSamples[
    reference_corpus$group==1, i]))
}
# BUG FIX: the three LW-vs-RC loops were missing, so the
# Perm.test.stat_LW_RC_ctxt_TFIDF_* vectors kept their rep(0, P) initial values
# and the LW-vs-RC p-values computed below were trivially 0 instead of being
# the result of an actual permutation test. The loops mirror the GR-vs-RC
# pattern.
for (i in seq_len(P)) {
  Perm.test.stat_LW_RC_ctxt_TFIDF_A1A2[i] <- abs(mean(LW_ctxt_TFIDF_PermSamples[
    literature_corpus$Level=="Children's literature", i]) - mean(RC_ctxt_TFIDF_PermSamples[
    reference_corpus$group==1, i]))
}
for (i in seq_len(P)) {
  Perm.test.stat_LW_RC_ctxt_TFIDF_B1[i] <- abs(mean(LW_ctxt_TFIDF_PermSamples[
    literature_corpus$Level=="Young adult fiction", i]) - mean(RC_ctxt_TFIDF_PermSamples[
    reference_corpus$group==1, i]))
}
for (i in seq_len(P)) {
  Perm.test.stat_LW_RC_ctxt_TFIDF_B2[i] <- abs(mean(LW_ctxt_TFIDF_PermSamples[
    literature_corpus$Level=="Adult literature", i]) - mean(RC_ctxt_TFIDF_PermSamples[
    reference_corpus$group==1, i]))
}
# === Calculate the permutation p-value ===
# Each p-value is the proportion of permutation test statistics that are at
# least as extreme as the observed absolute difference in group means.
# === GRs vs. LWs ===
# If there is no difference in the mean average graded vocabulary's context TFIDF of beginner GRs and LWs, we would observe a test statistic of 0.00028 or more by chance roughly 0.86% of the time.
sum(Perm.test.stat_GR_LW_ctxt_TFIDF_A1A2 >= GR_LW_mean_ctxt_TFIDF_stat_A1A2) / length(Perm.test.stat_GR_LW_ctxt_TFIDF_A1A2)
## [1] 0.0086
# If there is no difference in the mean average graded vocabulary's context TFIDF of intermediate GRs and LWs, we would observe a test statistic of 0.00025 or more by chance roughly 11.41% of the time.
sum(Perm.test.stat_GR_LW_ctxt_TFIDF_B1 >= GR_LW_mean_ctxt_TFIDF_stat_B1) / length(Perm.test.stat_GR_LW_ctxt_TFIDF_B1)
## [1] 0.1141
# If there is no difference in the mean average graded vocabulary's context TFIDF of advanced GRs and LWs, we would observe a test statistic of 0.00053 or more by chance roughly 0.13% of the time.
sum(Perm.test.stat_GR_LW_ctxt_TFIDF_B2 >= GR_LW_mean_ctxt_TFIDF_stat_B2) / length(Perm.test.stat_GR_LW_ctxt_TFIDF_B2)
## [1] 0.0013
# === GRs vs. RC ===
# If there is no difference in the mean average graded vocabulary's context TFIDF of beginner GRs and the RC, we would observe a test statistic of 0.0072 or more by chance roughly 27.25% of the time.
sum(Perm.test.stat_GR_RC_ctxt_TFIDF_A1A2 >= GR_RC_mean_ctxt_TFIDF_stat_A1A2) / length(Perm.test.stat_GR_RC_ctxt_TFIDF_A1A2)
## [1] 0.2725
# If there is no difference in the mean average graded vocabulary's context TFIDF of intermediate GRs and the RC, we would observe a test statistic of 0.0072 or more by chance roughly 22.36% of the time.
sum(Perm.test.stat_GR_RC_ctxt_TFIDF_B1 >= GR_RC_mean_ctxt_TFIDF_stat_B1) / length(Perm.test.stat_GR_RC_ctxt_TFIDF_B1)
## [1] 0.2236
# If there is no difference in the mean average graded vocabulary's context TFIDF of advanced GRs and the RC, we would observe a test statistic of 0.0070 or more by chance roughly 92.58% of the time.
sum(Perm.test.stat_GR_RC_ctxt_TFIDF_B2 >= GR_RC_mean_ctxt_TFIDF_stat_B2) / length(Perm.test.stat_GR_RC_ctxt_TFIDF_B2)
## [1] 0.9258
# === LWs vs. RC ===
# NOTE(review): in the original script no loop ever filled the
# Perm.test.stat_LW_RC_ctxt_TFIDF_* vectors, so the three 0 p-values below were
# trivial (all-zero vectors compared against a positive observed statistic).
# Verify the LW-vs-RC permutation loops have been run before citing these.
# If there is no difference in the mean average graded vocabulary's context TFIDF of beginner LWs and the RC, we would observe a test statistic of 0.0069 or more by chance roughly 0% of the time.
sum(Perm.test.stat_LW_RC_ctxt_TFIDF_A1A2 >= LW_RC_mean_ctxt_TFIDF_stat_A1A2) / length(Perm.test.stat_LW_RC_ctxt_TFIDF_A1A2)
## [1] 0
# If there is no difference in the mean average graded vocabulary's context TFIDF of intermediate LWs and the RC, we would observe a test statistic of 0.0075 or more by chance roughly 0% of the time.
sum(Perm.test.stat_LW_RC_ctxt_TFIDF_B1 >= LW_RC_mean_ctxt_TFIDF_stat_B1) / length(Perm.test.stat_LW_RC_ctxt_TFIDF_B1)
## [1] 0
# If there is no difference in the mean average graded vocabulary's context TFIDF of advanced LWs and the RC, we would observe a test statistic of 0.0076 or more by chance roughly 0% of the time.
sum(Perm.test.stat_LW_RC_ctxt_TFIDF_B2 >= LW_RC_mean_ctxt_TFIDF_stat_B2) / length(Perm.test.stat_LW_RC_ctxt_TFIDF_B2)
## [1] 0
# VISUALISATION OF THE MOST RELEVANT DIFFERENCES
# -- Average depth of the graded vocabulary context parse trees --
# Overlaid density histograms of Ctxt_Avg_Depth for three groups, each with its
# mean marked by a dot-dash vertical line and a smoothed density curve:
# - Graded readers vs. literary works
# - Graded readers vs. reference corpus
# - Literary works vs. reference corpus
# NOTE(review): GR_mean_ctxt_depth_A1A2, LW_mean_ctxt_depth_A1A2 and
# RC_mean_ctxt_depth are defined earlier in the script — presumably the Step 1
# group means for Ctxt_Avg_Depth; verify before reusing this chunk standalone.
# The annotation x/y positions are hand-tuned to this data's axis ranges.
GR.LW.RC <- ggplot() +
# Layer 1: beginner graded readers (density-scaled histogram + mean + curve).
geom_histogram(aes(x=readers_corpus$Ctxt_Avg_Depth[
readers_corpus$Level=="Beginner"], y=after_stat(density)), color = "white", fill = "#D14D72", bins = 100) +
geom_vline(aes(xintercept = GR_mean_ctxt_depth_A1A2), color = "black", linetype="dotdash", linewidth = 0.5) +
labs(x ='\nAverage depth of the parse trees of the sentences where the context of the graded vocabulary appears', y='Probability density\n', title = 'Comparison among all three corpora') +
geom_density(aes(x=readers_corpus$Ctxt_Avg_Depth[
readers_corpus$Level=="Beginner"]), color = "darkgray", linewidth = 1) +
annotate(geom="text", x=3.8, y=3.1, label="Beginner graded readers", size=5) +
# Layer 2: children's literature, semi-transparent so overlap stays visible.
geom_histogram(aes(x=literature_corpus$Ctxt_Avg_Depth[
literature_corpus$Level=="Children's literature"], y=after_stat(density)), color = "white", fill = "#FFABAB", alpha=0.6, bins = 100) +
geom_vline(aes(xintercept = LW_mean_ctxt_depth_A1A2), color = "black", linetype="dotdash", linewidth = 0.5) +
annotate(geom="text", x=4.7, y=1, label="Children's literature", size=5) +
geom_density(aes(x=literature_corpus$Ctxt_Avg_Depth[
literature_corpus$Level=="Children's literature"]), color = "darkgray", linewidth = 1) +
# Layer 3: reference corpus (group 1 rows only), light fill with black outline.
geom_histogram(aes(x=reference_corpus$Ctxt_Avg_Depth[
reference_corpus$group==1], y=after_stat(density)), color = "black", fill = "#FEF2F4", alpha=0.8, bins = 100) +
geom_vline(aes(xintercept = RC_mean_ctxt_depth), color = "black", linetype="dotdash", linewidth = 0.5) +
annotate(geom="text", x=7, y=0.6, label="Reference corpus", size=5) +
geom_density(aes(x=reference_corpus$Ctxt_Avg_Depth[
reference_corpus$group==1]),color = "darkgray", linewidth = 1) +
# Centered title and enlarged axis labels.
theme(plot.title = element_text(hjust = 0.5, size=18)) +
theme(axis.title.y = element_text(hjust = 0.5, size=14)) +
theme(axis.title.x = element_text(hjust = 0.5, size=14))
# Print the assembled plot.
GR.LW.RC